def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]
    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: Ensure that supported type conversions don't
            # segfault on empty arrays of common types
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
def _from_jvm_int_type(jvm_type):
    """
    Convert a JVM int type to its Python equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Int

    Returns
    -------
    typ: pyarrow.DataType
    """
    if jvm_type.isSigned:
        if jvm_type.bitWidth == 8:
            return pa.int8()
        elif jvm_type.bitWidth == 16:
            return pa.int16()
        elif jvm_type.bitWidth == 32:
            return pa.int32()
        elif jvm_type.bitWidth == 64:
            return pa.int64()
    else:
        if jvm_type.bitWidth == 8:
            return pa.uint8()
        elif jvm_type.bitWidth == 16:
            return pa.uint16()
        elif jvm_type.bitWidth == 32:
            return pa.uint32()
        elif jvm_type.bitWidth == 64:
            return pa.uint64()
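# The if/elif chain above can also be written as a table-driven lookup.
# A minimal sketch of that alternative; the helper name and the
# (bitWidth, isSigned) key layout are assumptions, not part of the
# original code:
_JVM_INT_TO_PA = {
    (8, True): pa.int8(), (16, True): pa.int16(),
    (32, True): pa.int32(), (64, True): pa.int64(),
    (8, False): pa.uint8(), (16, False): pa.uint16(),
    (32, False): pa.uint32(), (64, False): pa.uint64(),
}


def _from_jvm_int_type_lookup(jvm_type):
    # Returns None for unsupported bit widths, matching the implicit
    # fall-through of the if/elif version.
    return _JVM_INT_TO_PA.get((jvm_type.bitWidth, bool(jvm_type.isSigned)))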
def test_uint64_max_convert():
    data = [0, np.iinfo(np.uint64).max]

    arr = pa.array(data, type=pa.uint64())
    expected = pa.array(np.array(data, dtype='uint64'))
    assert arr.equals(expected)

    arr_inferred = pa.array(data)
    assert arr_inferred.equals(expected)
def dataframe_with_arrays(include_index=False):
    """
    Dataframe with numpy array columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    dtypes = [('i1', pa.int8()), ('i2', pa.int16()),
              ('i4', pa.int32()), ('i8', pa.int64()),
              ('u1', pa.uint8()), ('u2', pa.uint16()),
              ('u4', pa.uint32()), ('u8', pa.uint64()),
              ('f4', pa.float32()), ('f8', pa.float64())]

    arrays = OrderedDict()
    fields = []
    for dtype, arrow_dtype in dtypes:
        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
        arrays[dtype] = [
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ]

    fields.append(pa.field('str', pa.list_(pa.string())))
    arrays['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object")
    ]

    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
def test_type_for_alias():
    cases = [
        ('i1', pa.int8()),
        ('int8', pa.int8()),
        ('i2', pa.int16()),
        ('int16', pa.int16()),
        ('i4', pa.int32()),
        ('int32', pa.int32()),
        ('i8', pa.int64()),
        ('int64', pa.int64()),
        ('u1', pa.uint8()),
        ('uint8', pa.uint8()),
        ('u2', pa.uint16()),
        ('uint16', pa.uint16()),
        ('u4', pa.uint32()),
        ('uint32', pa.uint32()),
        ('u8', pa.uint64()),
        ('uint64', pa.uint64()),
        ('f4', pa.float32()),
        ('float32', pa.float32()),
        ('f8', pa.float64()),
        ('float64', pa.float64()),
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
    ]
    for val, expected in cases:
        assert pa.type_for_alias(val) == expected
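# A hedged usage sketch (not part of the original test file): the aliases
# exercised above are the same shorthand strings accepted elsewhere in
# pyarrow, and an unknown alias raises ValueError.
def demo_type_for_alias():
    assert pa.type_for_alias('f8') == pa.float64()
    with pytest.raises(ValueError):
        pa.type_for_alias('not-a-type')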
def test_integer_no_nulls(self):
    data = OrderedDict()
    fields = []

    numpy_dtypes = [
        ('i1', pa.int8()), ('i2', pa.int16()),
        ('i4', pa.int32()), ('i8', pa.int64()),
        ('u1', pa.uint8()), ('u2', pa.uint16()),
        ('u4', pa.uint32()), ('u8', pa.uint64()),
        ('longlong', pa.int64()), ('ulonglong', pa.uint64())
    ]
    num_values = 100

    for dtype, arrow_dtype in numpy_dtypes:
        info = np.iinfo(dtype)
        values = np.random.randint(max(info.min, np.iinfo(np.int_).min),
                                   min(info.max, np.iinfo(np.int_).max),
                                   size=num_values)
        data[dtype] = values.astype(dtype)
        fields.append(pa.field(dtype, arrow_dtype))

    df = pd.DataFrame(data)
    schema = pa.schema(fields)
    self._check_pandas_roundtrip(df, expected_schema=schema)
def test_is_integer():
    signed_ints = [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
    unsigned_ints = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]

    for t in signed_ints + unsigned_ints:
        assert types.is_integer(t)

    for t in signed_ints:
        assert types.is_signed_integer(t)
        assert not types.is_unsigned_integer(t)

    for t in unsigned_ints:
        assert types.is_unsigned_integer(t)
        assert not types.is_signed_integer(t)

    assert not types.is_integer(pa.float32())
    assert not types.is_signed_integer(pa.float32())
def test_integer_no_nulls(self):
    data = {}
    fields = []

    numpy_dtypes = [('i1', A.int8()), ('i2', A.int16()),
                    ('i4', A.int32()), ('i8', A.int64()),
                    ('u1', A.uint8()), ('u2', A.uint16()),
                    ('u4', A.uint32()), ('u8', A.uint64())]
    num_values = 100

    for dtype, arrow_dtype in numpy_dtypes:
        info = np.iinfo(dtype)
        values = np.random.randint(info.min,
                                   min(info.max, np.iinfo('i8').max),
                                   size=num_values)
        data[dtype] = values.astype(dtype)
        fields.append(A.Field.from_py(dtype, arrow_dtype))

    df = pd.DataFrame(data)
    schema = A.Schema.from_fields(fields)
    self._check_pandas_roundtrip(df, expected_schema=schema)
import decimal
import itertools

import numpy as np
import six
import pytz

int_type_pairs = [
    (np.int8, pa.int8()),
    (np.int16, pa.int16()),
    (np.int32, pa.int32()),
    (np.int64, pa.int64()),
    (np.uint8, pa.uint8()),
    (np.uint16, pa.uint16()),
    (np.uint32, pa.uint32()),
    (np.uint64, pa.uint64())]

np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    def __init__(self, lst):
        self.lst = lst

    def __iter__(self):
        return self.lst.__iter__()


def check_struct_type(ty, expected):
    """
def test_tensor_base_object():
    tensor = pa.Tensor.from_numpy(np.random.randn(10, 4))
    n = sys.getrefcount(tensor)
    array = tensor.to_numpy()
    assert sys.getrefcount(tensor) == n + 1


@pytest.mark.parametrize('dtype_str,arrow_type', [
    ('i1', pa.int8()),
    ('i2', pa.int16()),
    ('i4', pa.int32()),
    ('i8', pa.int64()),
    ('u1', pa.uint8()),
    ('u2', pa.uint16()),
    ('u4', pa.uint32()),
    ('u8', pa.uint64()),
    ('f2', pa.float16()),
    ('f4', pa.float32()),
    ('f8', pa.float64())
])
def test_tensor_numpy_roundtrip(dtype_str, arrow_type):
    dtype = np.dtype(dtype_str)
    data = (100 * np.random.randn(10, 4)).astype(dtype)

    tensor = pa.Tensor.from_numpy(data)
    assert tensor.type == arrow_type

    repr(tensor)

    result = tensor.to_numpy()
    assert (data == result).all()
from google.protobuf import text_format
from absl.testing import absltest
from absl.testing import parameterized
from tensorflow.python.framework import test_util  # pylint: disable=g-direct-tensorflow-import
from tensorflow_metadata.proto.v0 import schema_pb2

_ALL_SUPPORTED_INT_VALUE_TYPES = [
    pa.int8(), pa.int16(), pa.int32(), pa.int64(),
    pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64(),
]
_ALL_SUPPORTED_FLOATING_VALUE_TYPES = [pa.float32(), pa.float64()]
_ALL_SUPPORTED_STRING_VALUE_TYPES = [
    pa.binary(), pa.large_binary(), pa.string(), pa.large_string()
]
_ALL_SUPPORTED_VALUE_TYPES = (_ALL_SUPPORTED_INT_VALUE_TYPES +
                              _ALL_SUPPORTED_FLOATING_VALUE_TYPES +
                              _ALL_SUPPORTED_STRING_VALUE_TYPES)
_ARROW_TYPE_TO_TF_TYPE = {
    pa.int8(): tf.int8,
    pa.int16(): tf.int16,
    pa.int32(): tf.int32,
def test_array_uint64_from_py_over_range():
    arr = pa.array([2**63], type=pa.uint64())
    expected = pa.array(np.array([2**63], dtype='u8'))
    assert arr.equals(expected)
    assert array.equals(result)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
    if hasattr(func, '__arrow_compute_function__')
]

exported_option_classes = [
    cls for (name, cls) in sorted(pc.__dict__.items())
    if (isinstance(cls, type) and cls is not pc.FunctionOptions and
        issubclass(cls, pc.FunctionOptions))
]

numerical_arrow_types = [
    pa.int8(),
    pa.int16(),
    pa.int64(),
    pa.uint8(),
    pa.uint16(),
    pa.uint64(),
    pa.float32(),
    pa.float64()
]


def test_exported_functions():
    # Check that all exported concrete functions can be called with
    # the right number of arguments.
    # Note that unregistered functions (e.g. with a mismatching name)
    # will raise KeyError.
    functions = exported_functions
    assert len(functions) >= 10
    for func in functions:
        args = [object()] * func.__arrow_compute_function__['arity']
        with pytest.raises(TypeError,
input_schema = pa.schema([
    pa.field("input", pa.uint8(), False).with_metadata(
        {b'fletcher_epc': b'8'})
]).with_metadata({
    b'fletcher_mode': b'read',
    b'fletcher_name': b'input'
})

pa.output_stream("in.as").write(input_schema.serialize())

with pa.RecordBatchFileWriter('in.rb', input_schema) as writer:
    writer.write(
        pa.RecordBatch.from_arrays(
            [pa.array(
                [byte for byte in '{"voltage":[1128,1213,1850,429,1770,1683,1483,478,545,1555,867,1495,1398,1380,1753,438]}\n'.encode()],
                pa.uint8())],
            schema=input_schema)
    )

output_schema = pa.schema([
    pa.field("voltage", pa.list_(
        pa.field("item", pa.uint64(), False).with_metadata(
            {"illex_MIN": "0", "illex_MAX": "2047"})
    ), False).with_metadata(
        {"illex_MIN_LENGTH": "1", "illex_MAX_LENGTH": "16"}
    )
]).with_metadata({
    b'fletcher_mode': b'write',
    b'fletcher_name': b'output'
})

pa.output_stream("out.as").write(output_schema.serialize())
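# A hedged round-trip check (not in the original script): a buffer
# produced by Schema.serialize(), like the ones written above, can be
# read back with pa.ipc.read_schema.
assert pa.ipc.read_schema(output_schema.serialize()).equals(output_schema)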
from tfx_bsl.tfxio import tensor_adapter
from tfx_bsl.tfxio import tensor_to_arrow
from google.protobuf import text_format
from absl.testing import absltest
from absl.testing import parameterized
from tensorflow_metadata.proto.v0 import schema_pb2

_TF_TYPE_TO_ARROW_TYPE = {
    tf.int8: pa.int8(),
    tf.int16: pa.int16(),
    tf.int32: pa.int32(),
    tf.int64: pa.int64(),
    tf.uint8: pa.uint8(),
    tf.uint16: pa.uint16(),
    tf.uint32: pa.uint32(),
    tf.uint64: pa.uint64(),
    tf.float32: pa.float32(),
    tf.float64: pa.float64(),
    tf.string: pa.large_binary(),
}

_ROW_PARTITION_DTYPES = {"INT64": np.int64, "INT32": np.int32}


def _make_2d_dense_tensor_test_cases():
  result = []
  for tf_type, arrow_type in _TF_TYPE_TO_ARROW_TYPE.items():
    if tf_type == tf.string:
      tensor = tf.constant([[b"1", b"2"], [b"3", b"4"]], dtype=tf.string)
      expected_array = pa.array([[b"1", b"2"], [b"3", b"4"]],
                                type=pa.large_list(arrow_type))
import cudf
from cudf._lib.scalar import DeviceScalar
from cudf.core._compat import PANDAS_GE_120

_NA_REP = "<NA>"

_np_pa_dtypes = {
    np.float64: pa.float64(),
    np.float32: pa.float32(),
    np.int64: pa.int64(),
    np.longlong: pa.int64(),
    np.int32: pa.int32(),
    np.int16: pa.int16(),
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
from .driver import Driver
from .coord import coord2pos, pos2coord

__version__ = '19.11.2'

type_map_pyarrow = dict(
    [(t.__str__(), t)
     for t in (pyarrow.binary(),
               pyarrow.bool_(),
               pyarrow.int16(),
               pyarrow.int32(),
               pyarrow.int64(),
               pyarrow.int8(),
               pyarrow.string(),
               pyarrow.uint16(),
               pyarrow.uint32(),
               pyarrow.uint64(),
               pyarrow.uint8())] +
    [('char', pyarrow.string()),
     ('datetime', pyarrow.timestamp('s')),
     ('double', pyarrow.float64()),
     ('float', pyarrow.float32())])


class Array(object):
    """Wrapper for SciDB array stored externally"""

    def __init__(self, url):
        self.url = url
        self._metadata = None
        self._schema = None
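# A hedged usage sketch (not part of the original module): the keys of
# type_map_pyarrow are the str() names of the pyarrow types, plus the
# SciDB-specific aliases appended above.
assert type_map_pyarrow['int16'] == pyarrow.int16()
assert type_map_pyarrow['double'] == pyarrow.float64()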
"NUMERIC": pyarrow_numeric, "STRING": pyarrow.string, "TIME": pyarrow_time, "TIMESTAMP": pyarrow_timestamp, } ARROW_SCALAR_IDS_TO_BQ = { # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes pyarrow.bool_().id: "BOOL", pyarrow.int8().id: "INT64", pyarrow.int16().id: "INT64", pyarrow.int32().id: "INT64", pyarrow.int64().id: "INT64", pyarrow.uint8().id: "INT64", pyarrow.uint16().id: "INT64", pyarrow.uint32().id: "INT64", pyarrow.uint64().id: "INT64", pyarrow.float16().id: "FLOAT64", pyarrow.float32().id: "FLOAT64", pyarrow.float64().id: "FLOAT64", pyarrow.time32("ms").id: "TIME", pyarrow.time64("ns").id: "TIME", pyarrow.timestamp("ns").id: "TIMESTAMP", pyarrow.date32().id: "DATE", pyarrow.date64().id: "DATETIME", # because millisecond resolution pyarrow.binary().id: "BYTES", pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() # The exact scale and precision don't matter, see below. pyarrow.decimal128(38, scale=9).id: "NUMERIC", } if version.parse(pyarrow.__version__) >= version.parse("3.0.0"):
    bool: pa.bool_(),
    date: pa.date32(),
    time: pa.time64("us"),
    datetime: pa.timestamp("us"),
    timedelta: pa.duration("us"),
}

_DTYPE_TO_ARROW_TYPE = {
    Int8: pa.int8(),
    Int16: pa.int16(),
    Int32: pa.int32(),
    Int64: pa.int64(),
    UInt8: pa.uint8(),
    UInt16: pa.uint16(),
    UInt32: pa.uint32(),
    UInt64: pa.uint64(),
    Float32: pa.float32(),
    Float64: pa.float64(),
    Boolean: pa.bool_(),
    Utf8: pa.large_utf8(),
    Date: pa.date32(),
    Datetime: pa.timestamp("us"),
    Datetime("ms"): pa.timestamp("ms"),
    Datetime("us"): pa.timestamp("us"),
    Datetime("ns"): pa.timestamp("ns"),
    Duration: pa.duration("us"),
    Duration("ms"): pa.duration("ms"),
    Duration("us"): pa.duration("us"),
    Duration("ns"): pa.duration("ns"),
    Time: pa.time64("us"),
    # Time("ms"): pa.time32("ms"),
    alphabet=st.characters(min_codepoint=0x41, max_codepoint=0x7E))

null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())
large_binary_type = st.just(pa.large_binary())
large_string_type = st.just(pa.large_string())

signed_integer_types = st.sampled_from(
    [pa.int8(), pa.int16(), pa.int32(), pa.int64()])
unsigned_integer_types = st.sampled_from(
    [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()])
decimal_type = st.builds(pa.decimal128,
                         precision=st.integers(min_value=1, max_value=38),
                         scale=st.integers(min_value=1, max_value=38))
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([pa.date32(), pa.date64()])
time_types = st.sampled_from(
    [pa.time32('s'), pa.time32('ms'), pa.time64('us'), pa.time64('ns')])
timestamp_types = st.builds(pa.timestamp,
class KmvSketchTest(parameterized.TestCase):

  @parameterized.named_parameters(
      ("binary", [b"a", b"a", b"b", b"c", None], pa.binary()),
      ("large_binary", [b"a", b"a", b"b", b"c"], pa.large_binary()),
      ("string", ["a", "a", "b", "c", None], pa.string()),
      ("large_string", ["a", "a", "b", "c"], pa.large_string()),
      ("int8", [1, 1, 2, 3, None], pa.int8()),
      ("int16", [1, 1, 2, 3], pa.int16()),
      ("int32", [1, 1, 2, 3, None], pa.int32()),
      ("int64", [1, 1, 2, 3], pa.int64()),
      ("uint8", [1, 1, 2, 3], pa.uint8()),
      ("uint16", [1, None, 1, 2, 3], pa.uint16()),
      ("uint32", [1, 1, 2, 3], pa.uint32()),
      ("uint64", [1, 1, 2, 3, None], pa.uint64()),
  )
  def test_add(self, values, type_):
    sketch = _create_basic_sketch(pa.array(values, type=type_))
    num_unique = sketch.Estimate()
    self.assertEqual(3, num_unique)

  def test_add_unsupported_type(self):
    values = pa.array([True, False], pa.bool_())
    sketch = sketches.KmvSketch(_NUM_BUCKETS)
    with self.assertRaisesRegex(RuntimeError, "UNIMPLEMENTED: bool"):
      sketch.AddValues(values)

  def test_merge(self):
    sketch1 = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
    sketch2 = _create_basic_sketch(pa.array(["d", "a"]))

    sketch1.Merge(sketch2)
    num_unique = sketch1.Estimate()
    self.assertEqual(4, num_unique)

  def test_merge_error(self):
    sketch1 = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
    sketch2 = _create_basic_sketch(pa.array(["d", "a"]), num_buckets=64)
    with self.assertRaisesRegex(
        Exception, "Both sketches must have the same number of buckets"):
      sketch1.Merge(sketch2)

  def test_picklable(self):
    sketch = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
    pickled = pickle.dumps(sketch, 2)
    self.assertIsInstance(pickled, bytes)
    unpickled = pickle.loads(pickled)
    self.assertIsInstance(unpickled, sketches.KmvSketch)

    num_unique = unpickled.Estimate()
    self.assertEqual(3, num_unique)

  def test_serialization(self):
    sketch = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
    serialized = sketch.Serialize()
    self.assertIsInstance(serialized, bytes)

    deserialized = sketches.KmvSketch.Deserialize(serialized)
    self.assertIsInstance(deserialized, sketches.KmvSketch)

    num_unique = deserialized.Estimate()
    self.assertEqual(3, num_unique)

  def test_deserialize_fails_with_exception(self):
    with self.assertRaisesRegex(RuntimeError, "Failed to parse Kmv sketch"):
      sketches.KmvSketch.Deserialize("I am no proto")
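# `_create_basic_sketch` is used throughout the test class above but is
# not shown in this excerpt. A minimal sketch of what it presumably looks
# like (the name and default are taken from the call sites; the exact body
# is an assumption):
def _create_basic_sketch(values, num_buckets=_NUM_BUCKETS):
  sketch = sketches.KmvSketch(num_buckets)
  sketch.AddValues(values)
  return sketch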
@pytest.mark.parametrize(
    (
        'data', 'type', 'physical_type', 'min_value', 'max_value',
        'null_count', 'num_values', 'distinct_count'
    ),
    [
        ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, 0),
        ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, 0),
        ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, 0),
        ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, 0),
        ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, 0),
        ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, 0),
        ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, 0),
        ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, 0),
        (
            [-1.1, 2.2, 2.3, None, 4.4], pa.float32(),
            'FLOAT', -1.1, 4.4, 1, 4, 0
        ),
        (
            [-1.1, 2.2, 2.3, None, 4.4], pa.float64(),
            'DOUBLE', -1.1, 4.4, 1, 4, 0
        ),
        (
            ['', 'b', chr(1000), None, 'aaa'], pa.binary(),
            'BYTE_ARRAY', b'', chr(1000).encode('utf-8'), 1, 4, 0
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import os
import sys

import pytest
import weakref

import numpy as np
import pyarrow as pa

tensor_type_pairs = [
    ('i1', pa.int8()),
    ('i2', pa.int16()),
    ('i4', pa.int32()),
    ('i8', pa.int64()),
    ('u1', pa.uint8()),
    ('u2', pa.uint16()),
    ('u4', pa.uint32()),
    ('u8', pa.uint64()),
    ('f2', pa.float16()),
    ('f4', pa.float32()),
    ('f8', pa.float64())
]


def test_tensor_attrs():
    data = np.random.randn(10, 4)

    tensor = pa.Tensor.from_numpy(data)
    assert tensor.ndim == 2
    assert tensor.dim_names == []
    assert tensor.size == 40
    assert tensor.shape == data.shape
    assert tensor.strides == data.strides

    assert tensor.is_contiguous
# The specifications were created using:
#
#   om = jpype.JClass('com.fasterxml.jackson.databind.ObjectMapper')()
#   field = …  # Code to instantiate the field
#   jvm_spec = om.writeValueAsString(field)
@pytest.mark.parametrize('pa_type,jvm_spec', [
    (pa.null(), '{"name":"null"}'),
    (pa.bool_(), '{"name":"bool"}'),
    (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
    (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
    (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
    (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
    (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
    (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
    (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
    (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
    (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
    (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
    (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
    (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
    (pa.time32('ms'), '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
    (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
    (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'),
    (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
        '"timezone":null}'),
    (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",'
        '"timezone":null}'),
    (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",'
        '"timezone":null}'),
    (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",'
        '"timezone":null}'),
import pytest

from pyarrow.compat import unittest, u  # noqa
import pyarrow as pa

import datetime
import decimal
import itertools
import numpy as np
import six
import pytz

# Note: each numpy type is paired with the pyarrow type of the same width
# (the original excerpt paired np.int16/np.uint16 with 64-bit arrow types,
# which contradicts the other copies of this list in this document).
int_type_pairs = [(np.int8, pa.int8()),
                  (np.int16, pa.int16()),
                  (np.int32, pa.int32()),
                  (np.int64, pa.int64()),
                  (np.uint8, pa.uint8()),
                  (np.uint16, pa.uint16()),
                  (np.uint32, pa.uint32()),
                  (np.uint64, pa.uint64())]

np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    def __init__(self, lst):
        self.lst = lst

    def __iter__(self):
        return self.lst.__iter__()


def test_iterable_types():
    arr1 = pa.array(StrangeIterable([0, 1, 2, 3]))
import collections
import datetime
import decimal
import itertools

import numpy as np
import six
import pytz

int_type_pairs = [
    (np.int8, pa.int8()),
    (np.int16, pa.int16()),
    (np.int32, pa.int32()),
    (np.int64, pa.int64()),
    (np.uint8, pa.uint8()),
    (np.uint16, pa.uint16()),
    (np.uint32, pa.uint32()),
    (np.uint64, pa.uint64())]

np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    def __init__(self, lst):
        self.lst = lst

    def __iter__(self):
        return self.lst.__iter__()
import collections
import datetime
import decimal
import itertools
import math
import traceback
import sys

import numpy as np
import pytz
import six

int_type_pairs = [(np.int8, pa.int8()),
                  (np.int16, pa.int16()),
                  (np.int32, pa.int32()),
                  (np.int64, pa.int64()),
                  (np.uint8, pa.uint8()),
                  (np.uint16, pa.uint16()),
                  (np.uint32, pa.uint32()),
                  (np.uint64, pa.uint64())]

np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    def __init__(self, lst):
        self.lst = lst

    def __iter__(self):
        return self.lst.__iter__()


class MyInt:
    def __init__(self, value):
        self.value = value
class TestAbstractFileParserStatics:
    @pytest.mark.parametrize(
        # testing all datatypes as laid out here:
        # https://json-schema.org/understanding-json-schema/reference/type.html
        "input_json_type, output_pyarrow_type",
        [
            ("string", pa.large_string()),
            ("number", pa.float64()),
            ("integer", pa.int64()),
            ("object", pa.large_string()),
            ("array", pa.large_string()),
            ("boolean", pa.bool_()),
            ("null", pa.large_string()),
        ],
    )
    def test_json_type_to_pyarrow_type(self, input_json_type: str, output_pyarrow_type: Any) -> None:
        # Json -> PyArrow direction
        LOGGER.info(f"asserting that JSON type '{input_json_type}' converts to PyArrow type '{output_pyarrow_type}'...")
        assert AbstractFileParser.json_type_to_pyarrow_type(input_json_type) == output_pyarrow_type

    @pytest.mark.parametrize(
        # testing all datatypes as laid out here:
        # https://arrow.apache.org/docs/python/api/datatypes.html
        "input_pyarrow_types, output_json_type",
        [
            ((pa.null(),), "string"),  # null type
            ((pa.bool_(),), "boolean"),  # boolean type
            (
                (pa.int8(), pa.int16(), pa.int32(), pa.int64(),
                 pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()),
                "integer",
            ),  # integer types
            ((pa.float16(), pa.float32(), pa.float64(), pa.decimal128(5, 10), pa.decimal256(3, 8)), "number"),  # number types
            ((pa.time32("s"), pa.time64("ns"), pa.timestamp("ms"), pa.date32(), pa.date64()), "string"),  # temporal types
            ((pa.binary(), pa.large_binary()), "string"),  # binary types
            ((pa.string(), pa.utf8(), pa.large_string(), pa.large_utf8()), "string"),  # string types
            ((pa.list_(pa.string()), pa.large_list(pa.timestamp("us"))), "string"),  # array types
            ((pa.map_(pa.string(), pa.float32()), pa.dictionary(pa.int16(), pa.list_(pa.string()))), "string"),  # object types
        ],
    )
    def test_json_type_to_pyarrow_type_reverse(self, input_pyarrow_types: Tuple[Any], output_json_type: str) -> None:
        # PyArrow -> Json direction (reverse=True)
        for typ in input_pyarrow_types:
            LOGGER.info(f"asserting that PyArrow type '{typ}' converts to JSON type '{output_json_type}'...")
            assert AbstractFileParser.json_type_to_pyarrow_type(typ, reverse=True) == output_json_type

    @pytest.mark.parametrize(
        # if expecting fail, put pyarrow_schema as None
        "json_schema, pyarrow_schema",
        [
            (
                {"a": "string", "b": "number", "c": "integer", "d": "object", "e": "array", "f": "boolean", "g": "null"},
                {
                    "a": pa.large_string(),
                    "b": pa.float64(),
                    "c": pa.int64(),
                    "d": pa.large_string(),
                    "e": pa.large_string(),
                    "f": pa.bool_(),
                    "g": pa.large_string(),
                },
            ),
            ({"single_column": "object"}, {"single_column": pa.large_string()}),
            ({}, {}),
            ({"a": "NOT A REAL TYPE", "b": "another fake type"}, {"a": pa.large_string(), "b": pa.large_string()}),
            (["string", "object"], None),  # bad input type
        ],
    )
    def test_json_schema_to_pyarrow_schema(self, json_schema: Mapping[str, Any], pyarrow_schema: Mapping[str, Any]) -> None:
        # Json -> PyArrow direction
        if pyarrow_schema is not None:
            assert AbstractFileParser.json_schema_to_pyarrow_schema(json_schema) == pyarrow_schema
        else:
            with pytest.raises(Exception) as e_info:
                AbstractFileParser.json_schema_to_pyarrow_schema(json_schema)
                LOGGER.debug(str(e_info))

    @pytest.mark.parametrize(
        # if expecting fail, put json_schema as None
        "pyarrow_schema, json_schema",
        [
            (
                {
                    "a": pa.utf8(),
                    "b": pa.float16(),
                    "c": pa.uint32(),
                    "d": pa.map_(pa.string(), pa.float32()),
                    "e": pa.bool_(),
                    "f": pa.date64(),
                },
                {"a": "string", "b": "number", "c": "integer", "d": "string", "e": "boolean", "f": "string"},
            ),
            ({"single_column": pa.int32()}, {"single_column": "integer"}),
            ({}, {}),
            ({"a": "NOT A REAL TYPE", "b": "another fake type"}, {"a": "string", "b": "string"}),
            (["string", "object"], None),  # bad input type
        ],
    )
    def test_json_schema_to_pyarrow_schema_reverse(self, pyarrow_schema: Mapping[str, Any], json_schema: Mapping[str, Any]) -> None:
        # PyArrow -> Json direction (reverse=True)
        if json_schema is not None:
            assert AbstractFileParser.json_schema_to_pyarrow_schema(pyarrow_schema, reverse=True) == json_schema
        else:
            with pytest.raises(Exception) as e_info:
                AbstractFileParser.json_schema_to_pyarrow_schema(pyarrow_schema, reverse=True)
                LOGGER.debug(str(e_info))
    pyvalues = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h']
    values = pa.array(pyvalues, type=pa.binary())

    result = pa.ListArray.from_arrays(offsets, values)
    expected = pa.array([pyvalues[:2], pyvalues[2:5], pyvalues[5:8]])

    assert result.equals(expected)


def test_simple_type_construction():
    result = pa.lib.TimestampType()
    with pytest.raises(TypeError):
        str(result)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [(pa.null(), 'float64'),
     (pa.bool_(), 'bool'),
     (pa.int8(), 'int8'),
     (pa.int16(), 'int16'),
     (pa.int32(), 'int32'),
     (pa.int64(), 'int64'),
     (pa.uint8(), 'uint8'),
     (pa.uint16(), 'uint16'),
     (pa.uint32(), 'uint32'),
     (pa.uint64(), 'uint64'),
     (pa.float16(), 'float16'),
     (pa.float32(), 'float32'),
     (pa.float64(), 'float64'),
     (pa.date32(), 'date'),
     (pa.date64(), 'date'),
     (pa.binary(), 'bytes'),
     (pa.binary(length=4), 'bytes'),
     (pa.string(), 'unicode'),
     (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
     (pa.decimal(18, 3), 'decimal'),
     (pa.timestamp('ms'), 'datetime'),
     (pa.timestamp('us', 'UTC'), 'datetimetz'),
     (pa.time32('s'), 'time'),
     (pa.time64('us'), 'time')]
)
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected
              pa.field('b', pa.int8()),
              pa.field('c', pa.string())])
]

in_dict = {}
for i, type_ in enumerate(types):
    assert hash(type_) == hash(type_)
    in_dict[type_] = i
    assert in_dict[type_] == i


@pytest.mark.parametrize('t,check_func', [
    (pa.date32(), types.is_date32),
    (pa.date64(), types.is_date64),
    (pa.time32('s'), types.is_time32),
    (pa.time64('ns'), types.is_time64),
    (pa.int8(), types.is_int8),
    (pa.int16(), types.is_int16),
    (pa.int32(), types.is_int32),
    (pa.int64(), types.is_int64),
    (pa.uint8(), types.is_uint8),
    (pa.uint16(), types.is_uint16),
    (pa.uint32(), types.is_uint32),
    (pa.uint64(), types.is_uint64),
    (pa.float16(), types.is_float16),
    (pa.float32(), types.is_float32),
    (pa.float64(), types.is_float64)
])
def test_exact_primitive_types(t, check_func):
    assert check_func(t)
def test_array_uint64_from_py_over_range():
    arr = pa.array([2 ** 63], type=pa.uint64())
    expected = pa.array(np.array([2 ** 63], dtype='u8'))
    assert arr.equals(expected)
except ImportError:
  pd = None


_MERGE_TEST_CASES = [
    dict(
        testcase_name="empty_input",
        inputs=[],
        expected_output=dict(),
    ),
    dict(
        testcase_name="basic_types",
        inputs=[
            {
                "bool": pa.array([False, None, True], type=pa.bool_()),
                "int64": pa.array([1, None, 3], type=pa.int64()),
                "uint64": pa.array([1, None, 3], type=pa.uint64()),
                "int32": pa.array([1, None, 3], type=pa.int32()),
                "uint32": pa.array([1, None, 3], type=pa.uint32()),
                "float": pa.array([1., None, 3.], type=pa.float32()),
                "double": pa.array([1., None, 3.], type=pa.float64()),
                "bytes": pa.array([b"abc", None, b"ghi"], type=pa.binary()),
                "large_bytes": pa.array([b"abc", None, b"ghi"],
                                        type=pa.large_binary()),
                "unicode": pa.array([u"abc", None, u"ghi"], type=pa.utf8()),
                "large_unicode": pa.array([u"abc", None, u"ghi"],
                                          type=pa.large_utf8()),
            },
            {
                "bool": pa.array([None, False], type=pa.bool_()),
                "int64": pa.array([None, 4], type=pa.int64()),
                "uint64": pa.array([None, 4], type=pa.uint64()),
import pyarrow as pa


@pytest.mark.parametrize(['value', 'ty', 'klass', 'deprecated'], [
    (False, None, pa.BooleanScalar, pa.BooleanValue),
    (True, None, pa.BooleanScalar, pa.BooleanValue),
    (1, None, pa.Int64Scalar, pa.Int64Value),
    (-1, None, pa.Int64Scalar, pa.Int64Value),
    (1, pa.int8(), pa.Int8Scalar, pa.Int8Value),
    (1, pa.uint8(), pa.UInt8Scalar, pa.UInt8Value),
    (1, pa.int16(), pa.Int16Scalar, pa.Int16Value),
    (1, pa.uint16(), pa.UInt16Scalar, pa.UInt16Value),
    (1, pa.int32(), pa.Int32Scalar, pa.Int32Value),
    (1, pa.uint32(), pa.UInt32Scalar, pa.UInt32Value),
    (1, pa.int64(), pa.Int64Scalar, pa.Int64Value),
    (1, pa.uint64(), pa.UInt64Scalar, pa.UInt64Value),
    (1.0, None, pa.DoubleScalar, pa.DoubleValue),
    (np.float16(1.0), pa.float16(), pa.HalfFloatScalar, pa.HalfFloatValue),
    (1.0, pa.float32(), pa.FloatScalar, pa.FloatValue),
    (decimal.Decimal("1.123"), None, pa.Decimal128Scalar, pa.Decimal128Value),
    (decimal.Decimal("1.1234567890123456789012345678901234567890"),
     None, pa.Decimal256Scalar, pa.Decimal256Value),
    ("string", None, pa.StringScalar, pa.StringValue),
    (b"bytes", None, pa.BinaryScalar, pa.BinaryValue),
    ("largestring", pa.large_string(), pa.LargeStringScalar,
     pa.LargeStringValue),
    (b"largebytes", pa.large_binary(), pa.LargeBinaryScalar,
     pa.LargeBinaryValue),
    (b"abc", pa.binary(3), pa.FixedSizeBinaryScalar, pa.FixedSizeBinaryValue),
    ([1, 2, 3], None, pa.ListScalar, pa.ListValue),
    ([1, 2, 3, 4], pa.large_list(
"BOOL", pyarrow.int8().id: "INT64", pyarrow.int16().id: "INT64", pyarrow.int32().id: "INT64", pyarrow.int64().id: "INT64", pyarrow.uint8().id: "INT64", pyarrow.uint16().id: "INT64", pyarrow.uint32().id: "INT64", pyarrow.uint64().id: "INT64", pyarrow.float16().id: "FLOAT64", pyarrow.float32().id: "FLOAT64", pyarrow.float64().id: "FLOAT64", pyarrow.time32("ms").id: "TIME", pyarrow.time64("ns").id: "TIME", pyarrow.timestamp("ns").id: "TIMESTAMP", pyarrow.date32().id: "DATE",
bool_type = st.just(pa.bool_())
binary_type = st.just(pa.binary())
string_type = st.just(pa.string())

signed_integer_types = st.sampled_from([
    pa.int8(),
    pa.int16(),
    pa.int32(),
    pa.int64()
])
unsigned_integer_types = st.sampled_from([
    pa.uint8(),
    pa.uint16(),
    pa.uint32(),
    pa.uint64()
])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([
    pa.float16(),
    pa.float32(),
    pa.float64()
])
decimal_type = st.builds(
    pa.decimal128,
    precision=st.integers(min_value=1, max_value=38),
    scale=st.integers(min_value=1, max_value=38)
)
numeric_types = st.one_of(integer_types, floating_types, decimal_type)
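# A hedged usage sketch (the test below is an illustration, not part of
# the original strategies module): drawing an arbitrary numeric Arrow
# type in a property-based test.
from hypothesis import given


@given(ty=numeric_types)
def test_empty_array_has_requested_type(ty):
    arr = pa.array([], type=ty)
    assert arr.type == ty
    assert len(arr) == 0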
    np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy())


@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
# not needed once tfx_bsl can be PY3 only.
try:
  from pandas import DataFrame
except ImportError as err:
  sys.stderr.write("Error importing pandas. Some tfx_bsl functionalities "
                   "are not available. {}\n".format(err))
  DataFrame = Any
# pylint: enable=g-import-not-at-top
# pytype: enable=import-error
# pylint: enable=unused-import

_EMPTY_RECORD_BATCH = pa.RecordBatch.from_arrays([], [])

_NUMPY_KIND_TO_ARROW_TYPE = {
    "i": pa.int64(),
    "u": pa.uint64(),
    "f": pa.float64(),
    "b": pa.int8(),
    "S": pa.binary(),
    "O": pa.binary(),
    "U": pa.binary(),
}


def TotalByteSize(table_or_batch: Union[pa.Table, pa.RecordBatch],
                  ignore_unsupported=False):
  """Returns the in-memory size of a record batch or a table."""
  if isinstance(table_or_batch, pa.Table):
    return sum([
        _TotalByteSize(b, ignore_unsupported)
        for b in table_or_batch.to_batches(max_chunksize=None)
import datetime
import decimal
import itertools

import numpy as np
import six
import pytz

int_type_pairs = [
    (np.int8, pa.int8()),
    (np.int16, pa.int16()),
    (np.int32, pa.int32()),
    (np.int64, pa.int64()),
    (np.uint8, pa.uint8()),
    (np.uint16, pa.uint16()),
    (np.uint32, pa.uint32()),
    (np.uint64, pa.uint64())]

np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    def __init__(self, lst):
        self.lst = lst

    def __iter__(self):
        return self.lst.__iter__()
#!/usr/bin/env python
import pyarrow as pa
import os

fields = [
    ('uint8', pa.uint8()),
    ('uint16', pa.uint16()),
    ('uint32', pa.uint32()),
    ('uint64', pa.uint64()),
    ('int8', pa.int8()),
    ('int16', pa.int16()),
    ('int32', pa.int32()),
    ('int64', pa.int64()),
    ('float', pa.float32()),
    ('double', pa.float64()),
]

schema = pa.schema(fields)

uints = [1, 2, 4, 8]
ints = [1, -2, 4, -8]
floats = [1.1, -2.2, 4.4, -8.8]

columns = [
    pa.array(uints, type=pa.uint8()),
    pa.array(uints, type=pa.uint16()),
    pa.array(uints, type=pa.uint32()),
    pa.array(uints, type=pa.uint64()),
    pa.array(ints, type=pa.int8()),
    pa.array(ints, type=pa.int16()),
    pa.array(ints, type=pa.int32()),
def test_tensor_base_object():
    tensor = pa.Tensor.from_numpy(np.random.randn(10, 4))
    n = sys.getrefcount(tensor)
    array = tensor.to_numpy()  # noqa
    assert sys.getrefcount(tensor) == n + 1


@pytest.mark.parametrize('dtype_str,arrow_type', [('i1', pa.int8()),
                                                  ('i2', pa.int16()),
                                                  ('i4', pa.int32()),
                                                  ('i8', pa.int64()),
                                                  ('u1', pa.uint8()),
                                                  ('u2', pa.uint16()),
                                                  ('u4', pa.uint32()),
                                                  ('u8', pa.uint64()),
                                                  ('f2', pa.float16()),
                                                  ('f4', pa.float32()),
                                                  ('f8', pa.float64())])
def test_tensor_numpy_roundtrip(dtype_str, arrow_type):
    dtype = np.dtype(dtype_str)
    data = (100 * np.random.randn(10, 4)).astype(dtype)

    tensor = pa.Tensor.from_numpy(data)
    assert tensor.type == arrow_type

    repr(tensor)

    result = tensor.to_numpy()
    assert (data == result).all()
    storage_key = "dataset_uuid/some_index.parquet"
    index1 = ExplicitSecondaryIndex(
        column="col",
        index_dct=dict(zip(timestamps, [["part_1", "part_2"], ["part_3"]])),
        index_storage_key=storage_key,
        dtype=dtype,
    )
    key1 = index1.store(store, "dataset_uuid")

    index2 = ExplicitSecondaryIndex(column="col", index_storage_key=key1).load(store)
    assert index1 == index2


@pytest.mark.parametrize(
    "dtype,expected",
    [(pa.int8(), pa.int64()), (pa.uint8(), pa.uint64()), (None, None)],
)
def test_index_normalize_dtype(dtype, expected):
    index = ExplicitSecondaryIndex(
        column="col", dtype=dtype, index_storage_key="dataset_uuid/some_index.parquet"
    )
    assert index.dtype == expected


def test_index_raises_nested_dtype():
    with pytest.raises(NotImplementedError) as exc:
        ExplicitSecondaryIndex(
            column="col",
            dtype=pa.list_(pa.int8()),
            index_storage_key="dataset_uuid/some_index.parquet",
        assert in_dict[field] == i


@pytest.mark.parametrize('t,check_func', [
    (pa.date32(), types.is_date32),
    (pa.date64(), types.is_date64),
    (pa.time32('s'), types.is_time32),
    (pa.time64('ns'), types.is_time64),
    (pa.int8(), types.is_int8),
    (pa.int16(), types.is_int16),
    (pa.int32(), types.is_int32),
    (pa.int64(), types.is_int64),
    (pa.uint8(), types.is_uint8),
    (pa.uint16(), types.is_uint16),
    (pa.uint32(), types.is_uint32),
    (pa.uint64(), types.is_uint64),
    (pa.float16(), types.is_float16),
    (pa.float32(), types.is_float32),
    (pa.float64(), types.is_float64)
])
def test_exact_primitive_types(t, check_func):
    assert check_func(t)


def test_type_id():
    # enum values are not exposed publicly
    for ty in get_many_types():
        assert isinstance(ty.id, int)


def test_bit_width():
from pandas.core.arrays import ExtensionArray
from pandas.core.dtypes.dtypes import ExtensionDtype

from ._algorithms import all_op, any_op, extract_isnull_bytemap

_python_type_map = {
    pa.null().id: six.text_type,
    pa.bool_().id: bool,
    pa.int8().id: int,
    pa.uint8().id: int,
    pa.int16().id: int,
    pa.uint16().id: int,
    pa.int32().id: int,
    pa.uint32().id: int,
    pa.int64().id: int,
    pa.uint64().id: int,
    pa.float16().id: float,
    pa.float32().id: float,
    pa.float64().id: float,
    pa.date32().id: datetime.date,
    pa.date64().id: datetime.date,
    pa.timestamp("ms").id: datetime.datetime,
    pa.binary().id: six.binary_type,
    pa.string().id: six.text_type,
    # Use any list type here, only LIST is important
    pa.list_(pa.string()).id: list,
}

_string_type_map = {"date64[ms]": pa.date64(), "string": pa.string()}
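# A minimal usage sketch (the helper name is an assumption, not part of
# the original module): keying the map by `type.id` means parametric
# details such as timestamp unit or list value type are ignored.
def _python_type_for(arr):
    # e.g. _python_type_for(pa.array([1, 2, 3])) -> int
    return _python_type_map[arr.type.id]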