def _from_jvm_time_type(jvm_type): """ Convert a JVM time type to its Python equivalent. Parameters ---------- jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Time Returns ------- typ: pyarrow.DataType """ time_unit = jvm_type.getUnit().toString() if time_unit == 'SECOND': assert jvm_type.bitWidth == 32 return pa.time32('s') elif time_unit == 'MILLISECOND': assert jvm_type.bitWidth == 32 return pa.time32('ms') elif time_unit == 'MICROSECOND': assert jvm_type.bitWidth == 64 return pa.time64('us') elif time_unit == 'NANOSECOND': assert jvm_type.bitWidth == 64 return pa.time64('ns')
def test_time64_units():
    """time64 accepts 'us' and 'ns'; other units raise ValueError."""
    accepted = ('us', 'ns')
    rejected = ('m', 's', 'ms')

    for unit in accepted:
        assert pa.time64(unit).unit == unit

    for unit in rejected:
        expected_message = 'Invalid TimeUnit for time64: {}'.format(unit)
        with pytest.raises(ValueError, match=expected_message):
            pa.time64(unit)
def test_date_time_types():
    """Check date/time columns via _check_roundtrip and an unsupported type.

    The expected table differs from the written one in two columns, as the
    original comments state: date64 comes back as date32 and time32[s]
    comes back as time32[ms].
    """
    column_names = ['date32', 'date64', 'timestamp[us]',
                    'time32[s]', 'time64[us]', 'time32_from64[s]']

    days = np.array([17259, 17260, 17261], dtype='int32')
    date32_arr = pa.Array.from_pandas(days, type=pa.date32())
    date64_arr = pa.Array.from_pandas(days.astype('int64') * 86400000,
                                      type=pa.date64())

    first_us = pd.Timestamp('2000-01-01').value / 1000
    timestamp_arr = pa.Array.from_pandas(
        np.array([first_us, first_us + 1, first_us + 2], dtype='int64'),
        type=pa.timestamp('us'))

    ticks = np.arange(3, dtype='i4')
    time32_ms_arr = pa.Array.from_pandas(ticks, type=pa.time32('ms'))
    time64_us_arr = pa.Array.from_pandas(ticks.astype('int64'),
                                         type=pa.time64('us'))
    time32_s_arr = pa.Array.from_pandas(ticks, type=pa.time32('s'))
    # Seconds are expected back as milliseconds, hence the * 1000.
    time32_s_as_ms = pa.Array.from_pandas(ticks * 1000,
                                          type=pa.time32('ms'))

    written = pa.Table.from_arrays(
        [date32_arr, date64_arr, timestamp_arr,
         time32_ms_arr, time64_us_arr, time32_s_arr],
        column_names)
    read_back = pa.Table.from_arrays(
        [date32_arr, date32_arr, timestamp_arr,
         time32_ms_arr, time64_us_arr, time32_s_as_ms],
        column_names)

    _check_roundtrip(written, expected=read_back, version='2.0')

    # Unsupported stuff: writing time64[ns] must raise.
    time64_ns_arr = pa.Array.from_pandas(ticks.astype('int64'),
                                         type=pa.time64('ns'))
    unsupported = pa.Table.from_arrays([time64_ns_arr], ['unsupported'])
    sink = io.BytesIO()
    with pytest.raises(NotImplementedError):
        _write_table(unsupported, sink, version="2.0")
def test_arrow_time_to_pandas(self):
    """Check to_pandas() for time32/time64 arrays and a batch of them.

    The third input value is masked as null, so every expected array ends
    with None. Millisecond and second columns lose sub-unit precision,
    which the expected values mirror via ``time.replace``.
    """
    pytimes = [time(1, 2, 3, 1356),
               time(4, 5, 6, 1356),
               time(0, 0, 0)]

    expected = np.array(pytimes[:2] + [None])
    expected_ms = np.array([x.replace(microsecond=1000)
                            for x in pytimes[:2]] + [None])
    expected_s = np.array([x.replace(microsecond=0)
                           for x in pytimes[:2]] + [None])

    # Computed once; the original repeated this statement verbatim.
    micros = np.array([_pytime_to_micros(v) for v in pytimes],
                      dtype='int64')

    null_mask = np.array([False, False, True], dtype=bool)

    a1 = pa.array(micros, mask=null_mask, type=pa.time64('us'))
    a2 = pa.array(micros * 1000, mask=null_mask, type=pa.time64('ns'))
    a3 = pa.array((micros / 1000).astype('i4'), mask=null_mask,
                  type=pa.time32('ms'))
    a4 = pa.array((micros / 1000000).astype('i4'), mask=null_mask,
                  type=pa.time32('s'))

    names = ['time64[us]', 'time64[ns]', 'time32[ms]', 'time32[s]']
    batch = pa.RecordBatch.from_arrays([a1, a2, a3, a4], names)

    # Each array individually.
    assert (a1.to_pandas() == expected).all()
    assert (a2.to_pandas() == expected).all()
    assert (a3.to_pandas() == expected_ms).all()
    assert (a4.to_pandas() == expected_s).all()

    # All four columns together as a DataFrame.
    df = batch.to_pandas()
    expected_df = pd.DataFrame({'time64[us]': expected,
                                'time64[ns]': expected,
                                'time32[ms]': expected_ms,
                                'time32[s]': expected_s},
                               columns=names)

    tm.assert_frame_equal(df, expected_df)
def test_cast_time64_to_int():
    """Casting a time64[us] array to 'i8' must equal the int64 array."""
    raw_values = np.array([0, 1, 2], dtype='int64')
    time_arr = pa.array(raw_values, type=pa.time64('us'))

    as_int = time_arr.cast('i8')

    assert as_int.equals(pa.array([0, 1, 2], type='i8'))
def test_type_schema_pickling():
    """Types, a field, and a schema built from them must survive pickling."""
    struct_type = pa.struct([
        pa.field('a', 'int8'),
        pa.field('b', 'string')
    ])
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        struct_type,
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for case in cases:
        assert case == pickle.loads(pickle.dumps(case))

    # Existing Field objects are used as-is; bare types are wrapped in a
    # field named after their position in ``cases``.
    fields = [
        case if isinstance(case, pa.Field)
        else pa.field('_f{}'.format(position), case)
        for position, case in enumerate(cases)
    ]

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    assert schema == pickle.loads(pickle.dumps(schema))
def test_time_types():
    """Check units and string forms of time types, and rejected units."""
    seconds, millis = pa.time32('s'), pa.time32('ms')
    micros, nanos = pa.time64('us'), pa.time64('ns')

    unit_expectations = (
        (seconds, 's'),
        (millis, 'ms'),
        (micros, 'us'),
        (nanos, 'ns'),
    )
    for time_type, unit in unit_expectations:
        assert time_type.unit == unit

    assert str(seconds) == 'time32[s]'
    assert str(nanos) == 'time64[ns]'

    # Each factory is called with a unit belonging to the other width.
    for factory, wrong_unit in ((pa.time32, 'us'), (pa.time64, 's')):
        with pytest.raises(ValueError):
            factory(wrong_unit)
def test_type_for_alias():
    """Every string alias must resolve to the matching pyarrow type."""
    alias_to_type = {
        'i1': pa.int8(),
        'int8': pa.int8(),
        'i2': pa.int16(),
        'int16': pa.int16(),
        'i4': pa.int32(),
        'int32': pa.int32(),
        'i8': pa.int64(),
        'int64': pa.int64(),
        'u1': pa.uint8(),
        'uint8': pa.uint8(),
        'u2': pa.uint16(),
        'uint16': pa.uint16(),
        'u4': pa.uint32(),
        'uint32': pa.uint32(),
        'u8': pa.uint64(),
        'uint64': pa.uint64(),
        'f4': pa.float32(),
        'float32': pa.float32(),
        'f8': pa.float64(),
        'float64': pa.float64(),
        'date32': pa.date32(),
        'date64': pa.date64(),
        'string': pa.string(),
        'str': pa.string(),
        'binary': pa.binary(),
        'time32[s]': pa.time32('s'),
        'time32[ms]': pa.time32('ms'),
        'time64[us]': pa.time64('us'),
        'time64[ns]': pa.time64('ns'),
        'timestamp[s]': pa.timestamp('s'),
        'timestamp[ms]': pa.timestamp('ms'),
        'timestamp[us]': pa.timestamp('us'),
        'timestamp[ns]': pa.timestamp('ns'),
    }

    for alias, expected_type in alias_to_type.items():
        assert pa.type_for_alias(alias) == expected_type
def test_pytime_from_pandas(self):
    """Check conversion of datetime.time objects and raw integers to arrays.

    Covers type inference from an object ndarray, conversion through a
    DataFrame, and explicit time32/time64 types applied to integer data.
    """
    pytimes = [time(1, 2, 3, 1356),
               time(4, 5, 6, 1356)]
    first = pytimes[0]

    # An object array of time values (plus a null) is inferred as
    # microsecond-resolution time64.
    objects = np.array(pytimes + [None], dtype=object)
    inferred = pa.array(objects)
    assert inferred.type == pa.time64('us')
    assert inferred[0].as_py() == pytimes[0]
    assert inferred[1].as_py() == pytimes[1]
    assert inferred[2] is pa.NA

    # The same data routed through a DataFrame.
    frame = pd.DataFrame({'times': objects})
    assert pa.RecordBatch.from_pandas(frame)[0].equals(inferred)

    # ndarray of int64 microsecond values, rescaled per target unit.
    micros = np.array([_pytime_to_micros(v) for v in pytimes],
                      dtype='int64')
    integer_cases = (
        (micros, pa.time64('us'), first),
        (micros * 1000, pa.time64('ns'), first),
        ((micros / 1000).astype('i4'), pa.time32('ms'),
         first.replace(microsecond=1000)),
        ((micros / 1000000).astype('i4'), pa.time32('s'),
         first.replace(microsecond=0)),
    )
    for values, time_type, expected_first in integer_cases:
        converted = pa.array(values, type=time_type)
        assert converted[0].as_py() == expected_first
def test_types_hashable():
    """Data types must hash consistently and work as dictionary keys."""
    struct_type = pa.struct([pa.field('a', pa.int32()),
                             pa.field('b', pa.int8()),
                             pa.field('c', pa.string())])
    candidates = [
        pa.null(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        struct_type
    ]

    position_by_type = {}
    for position, data_type in enumerate(candidates):
        assert hash(data_type) == hash(data_type)
        position_by_type[data_type] = position
        assert position_by_type[data_type] == position
def get_many_types():
    """Return a tuple of assorted pyarrow data types.

    The types are built inside a function rather than at module level
    because the pa.dictionary type holds a pyarrow array, and a test in
    test_array.py checks that the default memory pool has zero allocated
    bytes.
    """
    scalar_types = (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
    )

    struct_types = (
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
    )

    union_types = (
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
    )

    return (
        scalar_types
        + (pa.list_(pa.int32()),)
        + struct_types
        + union_types
        + (pa.dictionary(pa.int32(), pa.string()),)
    )
def test_is_temporal_date_time_timestamp():
    """Date, time and timestamp predicates must each match only their kind."""
    dates = [pa.date32(), pa.date64()]
    times = [pa.time32('s'), pa.time64('ns')]
    timestamps = [pa.timestamp('ms')]

    for temporal in dates + times + timestamps:
        assert types.is_temporal(temporal)

    kind_predicates = (types.is_date, types.is_time, types.is_timestamp)
    groups = (
        (dates, types.is_date),
        (times, types.is_time),
        (timestamps, types.is_timestamp),
    )
    for members, own_predicate in groups:
        for member in members:
            assert own_predicate(member)
            # The predicates for the other two kinds must reject it.
            for predicate in kind_predicates:
                if predicate is not own_predicate:
                    assert not predicate(member)

    assert not types.is_temporal(pa.int32())
pa.field('b', pa.int8()), pa.field('c', pa.string())]) ] in_dict = {} for i, type_ in enumerate(types): assert hash(type_) == hash(type_) in_dict[type_] = i assert in_dict[type_] == i @pytest.mark.parametrize('t,check_func', [ (pa.date32(), types.is_date32), (pa.date64(), types.is_date64), (pa.time32('s'), types.is_time32), (pa.time64('ns'), types.is_time64), (pa.int8(), types.is_int8), (pa.int16(), types.is_int16), (pa.int32(), types.is_int32), (pa.int64(), types.is_int64), (pa.uint8(), types.is_uint8), (pa.uint16(), types.is_uint16), (pa.uint32(), types.is_uint32), (pa.uint64(), types.is_uint64), (pa.float16(), types.is_float16), (pa.float32(), types.is_float32), (pa.float64(), types.is_float64) ]) def test_exact_primitive_types(t, check_func): assert check_func(t)
]) decimal_type = st.builds( pa.decimal128, precision=st.integers(min_value=1, max_value=38), scale=st.integers(min_value=1, max_value=38) ) numeric_types = st.one_of(integer_types, floating_types, decimal_type) date_types = st.sampled_from([ pa.date32(), pa.date64() ]) time_types = st.sampled_from([ pa.time32('s'), pa.time32('ms'), pa.time64('us'), pa.time64('ns') ]) timestamp_types = st.builds( pa.timestamp, unit=st.sampled_from(['s', 'ms', 'us', 'ns']), tz=tzst.timezones() ) temporal_types = st.one_of(date_types, time_types, timestamp_types) primitive_types = st.one_of( null_type, bool_type, binary_type, string_type, numeric_types,
(pa.null(), '{"name":"null"}'), (pa.bool_(), '{"name":"bool"}'), (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'), (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'), (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'), (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'), (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'), (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'), (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'), (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'), (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'), (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'), (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'), (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'), (pa.time32('ms'), '{"name":"time","unit":"MILLISECOND","bitWidth":32}'), (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'), (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'), (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",' '"timezone":null}'), (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",' '"timezone":null}'), (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",' '"timezone":null}'), (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",' '"timezone":null}'), (pa.timestamp('ns', tz='UTC'), '{"name":"timestamp","unit":"NANOSECOND"' ',"timezone":"UTC"}'), (pa.timestamp('ns', tz='Europe/Paris'), '{"name":"timestamp",' '"unit":"NANOSECOND","timezone":"Europe/Paris"}'), (pa.date32(), '{"name":"date","unit":"DAY"}'), (pa.date64(), '{"name":"date","unit":"MILLISECOND"}'),
def test_date_time_types():
    """Check date/time/timestamp columns through ``_check_roundtrip``.

    One table is checked three times: with default settings, with
    ``use_deprecated_int96_timestamps=True``, and with ``flavor='spark'``.
    Finally a time64[ns] array is expected to raise NotImplementedError
    when written.
    """
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.Array.from_pandas(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.Array.from_pandas(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.Array.from_pandas(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.Array.from_pandas(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.Array.from_pandas(data4, type=t6)

    # Expected read-back form of a6: the same values scaled to milliseconds.
    ex_t6 = pa.time32('ms')
    ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000], dtype='int64')
    a7 = pa.Array.from_pandas(data7, type=t7)

    # Microsecond counterpart of a7, used as the expected value for the
    # default round trip.
    t7_us = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value
    data7_us = np.array([start, start + 1000, start + 2000],
                        dtype='int64') // 1000
    a7_us = pa.Array.from_pandas(data7_us, type=t7_us)

    # NOTE(review): the 'time32[s]' label is attached to a4, which is
    # time32('ms') -- the labels are only names here; confirm intent.
    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]',
                                  'timestamp[ns]'])

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' to 'timestamp[us]'
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]',
                                     'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.0')

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' is saved as INT96 timestamp
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]',
                                     'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.0',
                     use_deprecated_int96_timestamps=True)

    # Check that setting flavor to 'spark' uses int96 timestamps
    _check_roundtrip(table, expected=expected, version='2.0',
                     flavor='spark')

    # Unsupported stuff
    def _assert_unsupported(array):
        # Writing a single-column table of `array` must raise.
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    # t7 / a7 are rebound here; the timestamp[ns] array is no longer needed.
    t7 = pa.time64('ns')
    a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)
ARROW_SCALAR_IDS_TO_BQ = { # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes pyarrow.bool_().id: "BOOL", pyarrow.int8().id: "INT64", pyarrow.int16().id: "INT64", pyarrow.int32().id: "INT64", pyarrow.int64().id: "INT64", pyarrow.uint8().id: "INT64", pyarrow.uint16().id: "INT64", pyarrow.uint32().id: "INT64", pyarrow.uint64().id: "INT64", pyarrow.float16().id: "FLOAT64", pyarrow.float32().id: "FLOAT64", pyarrow.float64().id: "FLOAT64", pyarrow.time32("ms").id: "TIME", pyarrow.time64("ns").id: "TIME", pyarrow.timestamp("ns").id: "TIMESTAMP", pyarrow.date32().id: "DATE", pyarrow.date64().id: "DATETIME", # because millisecond resolution pyarrow.binary().id: "BYTES", pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() pyarrow.decimal128(38, scale=9).id: "NUMERIC", # The exact decimal's scale and precision are not important, as only # the type ID matters, and it's the same for all decimal128 instances. } else: # pragma: NO COVER BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER ARROW_SCALAR_IDS_TO_BQ = {} # pragma: NO_COVER
) # order is important; _string_like[:2] vs _string_like[::2] _string_like = ( pyarrow.string(), pyarrow.large_string(), pyarrow.binary(), pyarrow.large_binary(), ) _pyarrow_to_numpy_dtype = { pyarrow.date32(): (True, np.dtype("M8[D]")), pyarrow.date64(): (False, np.dtype("M8[ms]")), pyarrow.time32("s"): (True, np.dtype("M8[s]")), pyarrow.time32("ms"): (True, np.dtype("M8[ms]")), pyarrow.time64("us"): (False, np.dtype("M8[us]")), pyarrow.time64("ns"): (False, np.dtype("M8[ns]")), pyarrow.timestamp("s"): (False, np.dtype("M8[s]")), pyarrow.timestamp("ms"): (False, np.dtype("M8[ms]")), pyarrow.timestamp("us"): (False, np.dtype("M8[us]")), pyarrow.timestamp("ns"): (False, np.dtype("M8[ns]")), pyarrow.duration("s"): (False, np.dtype("m8[s]")), pyarrow.duration("ms"): (False, np.dtype("m8[ms]")), pyarrow.duration("us"): (False, np.dtype("m8[us]")), pyarrow.duration("ns"): (False, np.dtype("m8[ns]")), } if not ak._v2._util.numpy_at_least("1.17.0"): def packbits(bytearray, lsb_order=True): if lsb_order:
def test_in_expr_todo():
    """Exercise gandiva IN expressions on binary, timestamp, time and date.

    Each section builds a one-column table, filters it with an IN
    expression, and checks which row indices are selected.
    """
    import pyarrow.gandiva as gandiva
    # TODO: Implement reasonable support for timestamp, time & date.
    # Current exceptions:
    # pyarrow.lib.ArrowException: ExpressionValidationError:
    # Evaluation expression for IN clause returns XXXX values are of typeXXXX

    # binary
    arr = pa.array([b"ga", b"an", b"nd", b"di", b"iv", b"va"])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [b'an', b'nd'], pa.binary())
    condition = builder.make_condition(cond)

    in_filter = gandiva.make_filter(table.schema, condition)
    result = in_filter.evaluate(table.to_batches()[0],
                                pa.default_memory_pool())
    assert result.to_array().equals(pa.array([1, 2], type=pa.uint32()))

    # timestamp
    datetime_1 = datetime.datetime.utcfromtimestamp(1542238951.621877)
    datetime_2 = datetime.datetime.utcfromtimestamp(1542238911.621877)
    datetime_3 = datetime.datetime.utcfromtimestamp(1542238051.621877)

    arr = pa.array([datetime_1, datetime_2, datetime_3])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [datetime_2],
                                      pa.timestamp('ms'))
    condition = builder.make_condition(cond)

    in_filter = gandiva.make_filter(table.schema, condition)
    result = in_filter.evaluate(table.to_batches()[0],
                                pa.default_memory_pool())
    assert list(result.to_array()) == [1]

    # time
    time_1 = datetime_1.time()
    time_2 = datetime_2.time()
    time_3 = datetime_3.time()

    arr = pa.array([time_1, time_2, time_3])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    # time64 only accepts 'us' or 'ns'; the previous 'ms' raised ValueError
    # before the IN expression was ever built.
    cond = builder.make_in_expression(node_a, [time_2], pa.time64('us'))
    condition = builder.make_condition(cond)

    in_filter = gandiva.make_filter(table.schema, condition)
    result = in_filter.evaluate(table.to_batches()[0],
                                pa.default_memory_pool())
    assert list(result.to_array()) == [1]

    # date
    date_1 = datetime_1.date()
    date_2 = datetime_2.date()
    date_3 = datetime_3.date()

    arr = pa.array([date_1, date_2, date_3])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [date_2], pa.date32())
    condition = builder.make_condition(cond)

    in_filter = gandiva.make_filter(table.schema, condition)
    result = in_filter.evaluate(table.to_batches()[0],
                                pa.default_memory_pool())
    assert list(result.to_array()) == [1]
def pyarrow_time():
    """Return the pyarrow time64 type with microsecond ("us") unit."""
    microsecond_time = pyarrow.time64("us")
    return microsecond_time
(pa.bool_(), '{"name":"bool"}'), (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'), (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'), (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'), (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'), (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'), (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'), (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'), (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'), (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'), (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'), (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'), (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'), (pa.time32('ms'), '{"name":"time","unit":"MILLISECOND","bitWidth":32}'), (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'), (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'), (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",' '"timezone":null}'), (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",' '"timezone":null}'), (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",' '"timezone":null}'), (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",' '"timezone":null}'), (pa.timestamp('ns', tz='UTC'), '{"name":"timestamp","unit":"NANOSECOND"' ',"timezone":"UTC"}'), (pa.timestamp('ns', tz='Europe/Paris'), '{"name":"timestamp",' '"unit":"NANOSECOND","timezone":"Europe/Paris"}'),
"INT64", pyarrow.uint16().id: "INT64", pyarrow.uint32().id: "INT64", pyarrow.uint64().id: "INT64", pyarrow.float16().id: "FLOAT64", pyarrow.float32().id: "FLOAT64", pyarrow.float64().id: "FLOAT64", pyarrow.time32("ms").id: "TIME", pyarrow.time64("ns").id: "TIME", pyarrow.timestamp("ns").id: "TIMESTAMP", pyarrow.date32().id: "DATE", pyarrow.date64().id: "DATETIME", # because millisecond resolution pyarrow.binary().id: "BYTES", pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() pyarrow.decimal128(38, scale=9).id: "NUMERIC", # The exact decimal's scale and precision are not important, as only # the type ID matters, and it's the same for all decimal128 instances.
def dataframe_with_lists(include_index=False, parquet_compatible=False):
    """
    Dataframe with list columns of every possible primitive type.

    Parameters
    ----------
    include_index: bool
        If True, append an int64 field named '__index_level_0__' to the
        returned schema.
    parquet_compatible: bool
        Exclude types not supported by parquet (here: the time64('ns')
        list column).

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    date_data = [
        [],
        [date(2018, 1, 1), date(2032, 12, 30)],
        [date(2000, 6, 7)],
        None,
        [date(1969, 6, 9), date(1972, 7, 3)]
    ]
    time_data = [
        [time(23, 11, 11), time(1, 2, 3), time(23, 59, 59)],
        [],
        [time(22, 5, 59)],
        None,
        [time(0, 0, 0), time(18, 0, 2), time(12, 7, 3)]
    ]

    # (column name, list value type, per-row data), in output order.
    columns = [
        ('int64', pa.int64(), [
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            [0, 1, 2, 3, 4],
            None,
            [],
            np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2,
                     dtype=np.int64)[::2]
        ]),
        ('double', pa.float64(), [
            [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
            [0., 1., 2., 3., 4.],
            None,
            [],
            np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
        ]),
        ('bytes_list', pa.binary(), [
            [b"1", b"f"],
            None,
            [b"1"],
            [b"1", b"2", b"3"],
            [],
        ]),
        ('str_list', pa.string(), [
            [u"1", u"ä"],
            None,
            [u"1"],
            [u"1", u"2", u"3"],
            [],
        ]),
    ]

    temporal_pairs = [
        (pa.date32(), date_data),
        (pa.date64(), date_data),
        (pa.time32('s'), time_data),
        (pa.time32('ms'), time_data),
        (pa.time64('us'), time_data)
    ]
    if not parquet_compatible:
        temporal_pairs.append((pa.time64('ns'), time_data))

    # Temporal columns are named after the string form of their value type.
    columns.extend(
        ('{}_list'.format(value_type), value_type, data)
        for value_type, data in temporal_pairs
    )

    arrays = OrderedDict()
    fields = []
    for column_name, value_type, data in columns:
        fields.append(pa.field(column_name, pa.list_(value_type)))
        arrays[column_name] = data

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    return pd.DataFrame(arrays), pa.schema(fields)
np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy()) @pytest.mark.parametrize( ('type', 'expected'), [(pa.null(), 'empty'), (pa.bool_(), 'bool'), (pa.int8(), 'int8'), (pa.int16(), 'int16'), (pa.int32(), 'int32'), (pa.int64(), 'int64'), (pa.uint8(), 'uint8'), (pa.uint16(), 'uint16'), (pa.uint32(), 'uint32'), (pa.uint64(), 'uint64'), (pa.float16(), 'float16'), (pa.float32(), 'float32'), (pa.float64(), 'float64'), (pa.date32(), 'date'), (pa.date64(), 'date'), (pa.binary(), 'bytes'), (pa.binary(length=4), 'bytes'), (pa.string(), 'unicode'), (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'), (pa.decimal128(18, 3), 'decimal'), (pa.timestamp('ms'), 'datetime'), (pa.timestamp('us', 'UTC'), 'datetimetz'), (pa.time32('s'), 'time'), (pa.time64('us'), 'time')]) def test_logical_type(type, expected): assert get_logical_type(type) == expected def test_array_uint64_from_py_over_range(): arr = pa.array([2**63], type=pa.uint64()) expected = pa.array(np.array([2**63], dtype='u8')) assert arr.equals(expected) def test_array_conversions_no_sentinel_values(): arr = np.array([1, 2, 3, 4], dtype='int8') refcount = sys.getrefcount(arr) arr2 = pa.array(arr) # noqa assert sys.getrefcount(arr) == (refcount + 1)
(pa.uint32(), 'uint32'), (pa.uint64(), 'uint64'), (pa.float16(), 'float16'), (pa.float32(), 'float32'), (pa.float64(), 'float64'), (pa.date32(), 'date'), (pa.date64(), 'date'), (pa.binary(), 'bytes'), (pa.binary(length=4), 'bytes'), (pa.string(), 'unicode'), (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'), (pa.decimal128(18, 3), 'decimal'), (pa.timestamp('ms'), 'datetime'), (pa.timestamp('us', 'UTC'), 'datetimetz'), (pa.time32('s'), 'time'), (pa.time64('us'), 'time') ] ) def test_logical_type(type, expected): assert get_logical_type(type) == expected def test_array_uint64_from_py_over_range(): arr = pa.array([2 ** 63], type=pa.uint64()) expected = pa.array(np.array([2 ** 63], dtype='u8')) assert arr.equals(expected) def test_array_conversions_no_sentinel_values(): arr = np.array([1, 2, 3, 4], dtype='int8') refcount = sys.getrefcount(arr)
# Strategies producing pyarrow data types. `st` and `tzst` are assumed to be
# hypothesis strategy modules imported elsewhere in this file -- TODO confirm.
unsigned_integer_types = st.sampled_from(
    [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()])
# `signed_integer_types` is defined earlier in the file (not shown here).
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()])
# NOTE(review): precision and scale are drawn independently from 1..38, so a
# generated scale can exceed its precision -- confirm this is intended.
decimal_type = st.builds(pa.decimal128,
                         precision=st.integers(min_value=1, max_value=38),
                         scale=st.integers(min_value=1, max_value=38))
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([pa.date32(), pa.date64()])
# time32 is paired with 's'/'ms' and time64 with 'us'/'ns'.
time_types = st.sampled_from(
    [pa.time32('s'), pa.time32('ms'), pa.time64('us'), pa.time64('ns')])
# Timestamps combine any of the four units with a generated timezone.
timestamp_types = st.builds(pa.timestamp,
                            unit=st.sampled_from(['s', 'ms', 'us', 'ns']),
                            tz=tzst.timezones())
duration_types = st.builds(pa.duration,
                           st.sampled_from(['s', 'ms', 'us', 'ns']))
temporal_types = st.one_of(date_types, time_types, timestamp_types,
                           duration_types)

# Union of all non-nested type strategies; the null/bool/binary/string
# strategies referenced here are defined earlier in the file (not shown).
primitive_types = st.one_of(null_type, bool_type, binary_type, string_type,
                            large_binary_type, large_string_type,
                            numeric_types, temporal_types)

# Text-to-text dictionaries, named for use as metadata.
metadata = st.dictionaries(st.text(), st.text())
# specific language governing permissions and limitations # under the License. import pickle import pytest import pyarrow as pa import pyarrow.types as types MANY_TYPES = [ pa.null(), pa.bool_(), pa.int32(), pa.time32('s'), pa.time64('us'), pa.date32(), pa.timestamp('us'), pa.timestamp('us', tz='UTC'), pa.timestamp('us', tz='Europe/Paris'), pa.float16(), pa.float32(), pa.float64(), pa.decimal128(19, 4), pa.string(), pa.binary(), pa.binary(10), pa.list_(pa.int32()), pa.struct([ pa.field('a', pa.int32()), pa.field('b', pa.int8()),
def test_date_time_types(tempdir):
    """Check date/time round trips and the physical type used for timestamps.

    Phase 1 checks a mixed date/time table via ``_check_roundtrip``.
    Phases 2-4 write a table of ms/us/ns timestamps to files under
    ``tempdir`` and assert the Parquet physical type of each column:
    INT64 by default, INT96 with ``use_deprecated_int96_timestamps=True``
    and with ``flavor='spark'``.
    """
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.array(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.array(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.array(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.array(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.array(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.array(data4, type=t6)

    # Expected read-back form of a6: the same values scaled to milliseconds.
    ex_t6 = pa.time32('ms')
    ex_a6 = pa.array(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000], dtype='int64')
    a7 = pa.array(data7, type=t7)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    # date64 as date32
    # time32[s] to time32[ms]
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    _check_roundtrip(table, expected=expected, version='2.0')

    # From here on t1/a1/t2/a2, table and expected are rebound for the
    # timestamp-only table.
    t0 = pa.timestamp('ms')
    data0 = np.arange(4, dtype='int64')
    a0 = pa.array(data0, type=t0)

    t1 = pa.timestamp('us')
    data1 = np.arange(4, dtype='int64')
    a1 = pa.array(data1, type=t1)

    t2 = pa.timestamp('ns')
    data2 = np.arange(4, dtype='int64')
    a2 = pa.array(data2, type=t2)

    table = pa.Table.from_arrays([a0, a1, a2], ['ts[ms]', 'ts[us]', 'ts[ns]'])
    expected = pa.Table.from_arrays([a0, a1, a2],
                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])

    # int64 for all timestamps supported by default
    filename = tempdir / 'int64_timestamps.parquet'
    _write_table(table, filename, version='2.0')
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT64'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

    # For the INT96 phases the ms and us columns are expected back as
    # nanosecond timestamps, so their values are scaled accordingly.
    t0_ns = pa.timestamp('ns')
    data0_ns = np.array(data0 * 1000000, dtype='int64')
    a0_ns = pa.array(data0_ns, type=t0_ns)

    t1_ns = pa.timestamp('ns')
    data1_ns = np.array(data1 * 1000, dtype='int64')
    a1_ns = pa.array(data1_ns, type=t1_ns)

    expected = pa.Table.from_arrays([a0_ns, a1_ns, a2],
                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])

    # int96 nanosecond timestamps produced upon request
    filename = tempdir / 'explicit_int96_timestamps.parquet'
    _write_table(table, filename, version='2.0',
                 use_deprecated_int96_timestamps=True)
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT96'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

    # int96 nanosecond timestamps implied by flavor 'spark'
    filename = tempdir / 'spark_int96_timestamps.parquet'
    _write_table(table, filename, version='2.0',
                 flavor='spark')
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT96'
    read_table = _read_table(filename)
    assert read_table.equals(expected)
def test_date_time_types():
    """Round-trip date, time and timestamp columns through Parquet 2.0.

    Three round trips are checked on the same input table:

    * default options -- the timestamp('ns') column is expected back as
      timestamp('us');
    * ``use_deprecated_int96_timestamps=True`` -- the timestamp('ns')
      column is expected back unchanged;
    * ``flavor='spark'`` -- same expectation as the INT96 case.

    In every case the date64 column is expected to hold the date32 array
    and the time32('s') column its time32('ms') equivalent.  Finally,
    writing a time64('ns') column must raise NotImplementedError.
    """
    days = np.array([17259, 17260, 17261], dtype='int32')
    date32_arr = pa.array(days, type=pa.date32())
    date64_arr = pa.array(days.astype('int64') * 86400000, type=pa.date64())

    start = pd.Timestamp('2000-01-01').value / 1000
    us_values = np.array([start, start + 1, start + 2], dtype='int64')
    ts_us_arr = pa.array(us_values, type=pa.timestamp('us'))

    ticks = np.arange(3, dtype='i4')
    time32_ms_arr = pa.array(ticks, type=pa.time32('ms'))
    time64_us_arr = pa.array(ticks.astype('int64'), type=pa.time64('us'))
    time32_s_arr = pa.array(ticks, type=pa.time32('s'))
    # The same instants as time32_s_arr, expressed in milliseconds.
    time32_s_as_ms_arr = pa.array(ticks * 1000, type=pa.time32('ms'))

    start = pd.Timestamp('2001-01-01').value
    ns_values = np.array([start, start + 1000, start + 2000], dtype='int64')
    ts_ns_arr = pa.array(ns_values, type=pa.timestamp('ns'))
    # The nanosecond values truncated to microseconds.
    ts_ns_as_us_arr = pa.array(ns_values // 1000, type=pa.timestamp('us'))

    names = ['date32', 'date64', 'timestamp[us]',
             'time32[s]', 'time64[us]',
             'time32_from64[s]',
             'timestamp[ns]']

    table = pa.Table.from_arrays(
        [date32_arr, date64_arr, ts_us_arr, time32_ms_arr, time64_us_arr,
         time32_s_arr, ts_ns_arr],
        names)

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' to 'timestamp[us]'
    coerced_expected = pa.Table.from_arrays(
        [date32_arr, date32_arr, ts_us_arr, time32_ms_arr, time64_us_arr,
         time32_s_as_ms_arr, ts_ns_as_us_arr],
        names)
    _check_roundtrip(table, expected=coerced_expected, version='2.0')

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' is saved as INT96 timestamp
    int96_expected = pa.Table.from_arrays(
        [date32_arr, date32_arr, ts_us_arr, time32_ms_arr, time64_us_arr,
         time32_s_as_ms_arr, ts_ns_arr],
        names)
    _check_roundtrip(table, expected=int96_expected, version='2.0',
                     use_deprecated_int96_timestamps=True)

    # Check that setting flavor to 'spark' uses int96 timestamps
    _check_roundtrip(table, expected=int96_expected, version='2.0',
                     flavor='spark')

    # Unsupported stuff
    def _assert_unsupported(array):
        # Writing a one-column table built from `array` must be rejected.
        single_column = pa.Table.from_arrays([array], ['unsupported'])
        sink = io.BytesIO()
        with pytest.raises(NotImplementedError):
            _write_table(single_column, sink, version="2.0")

    time64_ns_arr = pa.array(ticks.astype('int64'), type=pa.time64('ns'))
    _assert_unsupported(time64_ns_arr)
def main():
    """Write ``example.parquet`` with one three-row column per Arrow type.

    Each column is described once as ``(name, Arrow type, sample values)``;
    the schema and the arrays are both derived from that description, so
    they cannot drift apart.  The resulting record batch is wrapped in a
    table and written to ``example.parquet`` in the current directory.
    """
    # https://arrow.apache.org/docs/python/api/datatypes.html
    counts = [1, 2, 3]
    reals = [1.0, 2.0, 3.0]
    moments = [datetime(2019, 9, 3, hour, 0, 0) for hour in (9, 10, 11)]

    # Types from the page above that are deliberately left out: null,
    # float16, binary, utf8, large_binary, large_string, large_utf8, list_,
    # large_list, struct, dictionary, field, schema, from_numpy_dtype.
    columns = [
        ('c_bool', pa.bool_(), [False, True, False]),
        ('c_int8', pa.int8(), counts),
        ('c_int16', pa.int16(), counts),
        ('c_int32', pa.int32(), counts),
        ('c_int64', pa.int64(), counts),
        ('c_uint8', pa.uint8(), counts),
        ('c_uint16', pa.uint16(), counts),
        ('c_uint32', pa.uint32(), counts),
        ('c_uint64', pa.uint64(), counts),
        ('c_float32', pa.float32(), reals),
        ('c_float64', pa.float64(), reals),
        ('c_time32', pa.time32('ms'), counts),
        ('c_time64', pa.time64('ns'), counts),
        ('c_timestamp', pa.timestamp('ms'), moments),
        ('c_date32', pa.date32(), moments),
        ('c_date64', pa.date64(), moments),
        ('c_string', pa.string(),
         ['*****@*****.**', '*****@*****.**', '*****@*****.**']),
        ('c_decimal128_8_3', pa.decimal128(8, 3), counts),
    ]

    my_schema = pa.schema(
        [(name, arrow_type) for name, arrow_type, _ in columns])
    arrays = [pa.array(values, type=arrow_type)
              for _, arrow_type, values in columns]

    batch = pa.RecordBatch.from_arrays(arrays, schema=my_schema)
    pq.write_table(pa.Table.from_batches([batch]), 'example.parquet')
pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64() ] SIGNED_INT_PYARROW_DTYPES = [ pa.uint8(), pa.int16(), pa.int32(), pa.uint64() ] ALL_INT_PYARROW_DTYPES = UNSIGNED_INT_PYARROW_DTYPES + SIGNED_INT_PYARROW_DTYPES FLOAT_PYARROW_DTYPES = [pa.float32(), pa.float64()] STRING_PYARROW_DTYPES = [pa.string(), pa.utf8()] TIME_PYARROW_DTYPES = [ pa.time32("s"), pa.time32("ms"), pa.time64("us"), pa.time64("ns"), ] DATE_PYARROW_DTYPES = [pa.date32(), pa.date64()] DATETIME_PYARROW_DTYPES = [ pa.timestamp(unit=unit, tz=tz) for unit in ["s", "ms", "us", "ns"] for tz in [None, "UTC", "US/Pacific", "US/Eastern"] ] TIMEDELTA_PYARROW_DTYPES = [ pa.duration(unit) for unit in ["s", "ms", "us", "ns"] ] BOOL_PYARROW_DTYPES = [pa.bool_()] # TODO: Add container like pyarrow types: # https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions
("int8", pa.int8()), ("int16", pa.int16()), ("int32", pa.int32()), ("int64", pa.int64()), ("uint8", pa.uint8()), ("uint16", pa.uint16()), ("uint32", pa.uint32()), ("uint64", pa.uint64()), ("float16", pa.float16()), ("float32", pa.float32()), ("float64", pa.float64()), ("decimal128(38,1)", pa.decimal128(38, 1)), ("decimal128(1,2)", pa.decimal128(1, 2)), ("time32(s)", pa.time32("s")), ("time32(ms)", pa.time32("ms")), ("time64(us)", pa.time64("us")), ("time64(ns)", pa.time64("ns")), ("timestamp(s)", pa.timestamp("s")), ("timestamp(ms)", pa.timestamp("ms")), ("timestamp(us)", pa.timestamp("us")), ("timestamp(ns)", pa.timestamp("ns")), ("date32", pa.date32()), ("date64", pa.date64()), ("string", pa.string()), ("large_string", pa.large_string()), ("utf8", pa.utf8()), ("large_utf8", pa.large_utf8()), ("binary", pa.binary()), ("binary(128)", pa.binary(128)), ("large_binary", pa.large_binary()), ("struct<num:int64>", pa.struct([("num", pa.int64())])),