def get_many_types():
    """Return a tuple of assorted Arrow data types used as test inputs.

    The types are produced by a function rather than stored at module
    level because the pa.dictionary type holds a pyarrow array, and
    test_array.py::test_total_bytes_allocated checks that the default
    memory pool has zero allocated bytes.
    """
    scalars = (pa.null(), pa.bool_(), pa.int32())
    temporal = (
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
    )
    numeric = (
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
    )
    binary_like = (pa.string(), pa.binary(), pa.binary(10))
    nested = (
        pa.list_(pa.int32()),
        pa.struct([
            pa.field('a', pa.int32()),
            pa.field('b', pa.int8()),
            pa.field('c', pa.string()),
        ]),
        pa.struct([
            pa.field('a', pa.int32(), nullable=False),
            pa.field('b', pa.int8(), nullable=False),
            pa.field('c', pa.string()),
        ]),
    )
    unions = (
        pa.union(
            [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_DENSE),
        pa.union(
            [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_SPARSE),
        pa.union(
            [pa.field('a', pa.binary(10), nullable=False),
             pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_SPARSE),
    )
    dictionaries = (
        pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])),
    )
    # Concatenation keeps the elements in the same order as they are listed.
    return (scalars + temporal + numeric + binary_like + nested + unions
            + dictionaries)
def test_cast_from_null():
    """Cast an all-null array to many target types.

    The first group of targets is handed to ``_check_cast_case`` as a
    ``(in_data, in_type, in_data, out_type)`` tuple.  For the second group
    (dictionary and union types) ``Array.cast`` must raise
    ``NotImplementedError``.
    """
    null_values = [None] * 3
    null_type = pa.null()

    castable_targets = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([
            pa.field('a', pa.int32()),
            pa.field('b', pa.list_(pa.int8())),
            pa.field('c', pa.string()),
        ]),
    ]
    for target in castable_targets:
        _check_cast_case((null_values, null_type, null_values, target))

    unsupported_targets = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union(
            [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_DENSE),
        pa.union(
            [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_SPARSE),
    ]
    null_array = pa.array(null_values, type=pa.null())
    for target in unsupported_targets:
        with pytest.raises(NotImplementedError):
            null_array.cast(target)
def test_cast_from_null():
    """Cast an all-null array to many target types, including large_list.

    The first group of targets is handed to ``_check_cast_case`` as a
    ``(in_data, in_type, in_data, out_type)`` tuple.  For the second group
    (dictionary and union types) ``Array.cast`` must raise
    ``NotImplementedError``.
    """
    null_values = [None] * 3
    null_type = pa.null()

    castable_targets = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.large_list(pa.uint8()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([
            pa.field('a', pa.int32()),
            pa.field('b', pa.list_(pa.int8())),
            pa.field('c', pa.string()),
        ]),
    ]
    for target in castable_targets:
        _check_cast_case((null_values, null_type, null_values, target))

    unsupported_targets = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union(
            [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_DENSE),
        pa.union(
            [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_SPARSE),
    ]
    null_array = pa.array(null_values, type=pa.null())
    for target in unsupported_targets:
        with pytest.raises(NotImplementedError):
            null_array.cast(target)
def test_union_type():
    """Check every union factory for both modes, plus invalid mode values.

    For each sparse and dense factory the test verifies the resulting
    class, the reported mode, the child fields, the default and explicit
    type codes, and that a type-code list of the wrong length raises
    ``ValueError``.  Finally, unknown mode values passed to ``pa.union``
    must raise ``ValueError`` matching 'Invalid union mode'.
    """
    def check_fields(ty, expected):
        assert ty.num_fields == len(expected)
        assert [ty[idx] for idx in range(ty.num_fields)] == expected

    fields = [pa.field('x', pa.list_(pa.int32())),
              pa.field('y', pa.binary())]
    type_codes = [5, 9]

    # (expected mode string, expected type class, factories), sparse first.
    scenarios = [
        ('sparse', pa.SparseUnionType, [
            partial(pa.union, mode='sparse'),
            partial(pa.union, mode=pa.lib.UnionMode_SPARSE),
            pa.sparse_union,
        ]),
        ('dense', pa.DenseUnionType, [
            partial(pa.union, mode='dense'),
            partial(pa.union, mode=pa.lib.UnionMode_DENSE),
            pa.dense_union,
        ]),
    ]

    for mode_name, union_cls, factories in scenarios:
        for make_union in factories:
            # Default type codes
            ty = make_union(fields)
            assert isinstance(ty, union_cls)
            assert ty.mode == mode_name
            check_fields(ty, fields)
            assert ty.type_codes == [0, 1]

            # Explicit type codes
            ty = make_union(fields, type_codes=type_codes)
            assert ty.mode == mode_name
            check_fields(ty, fields)
            assert ty.type_codes == type_codes

            # Invalid number of type codes
            with pytest.raises(ValueError):
                make_union(fields, type_codes=type_codes[1:])

    for bad_mode in ('unknown', 2):
        with pytest.raises(ValueError, match='Invalid union mode'):
            pa.union(fields, mode=bad_mode)
def test_type_schema_pickling():
    """Data types, fields and a schema built from them survive pickling.

    Each case is pickled and unpickled and compared for equality with the
    original.  The cases are then wrapped into fields (existing Field
    objects are used as-is) to build a schema with metadata, which is
    round-tripped the same way.
    """
    def roundtrip(obj):
        # Only objects created by this test are unpickled here.
        return pickle.loads(pickle.dumps(obj))

    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.map_(pa.string(), pa.int8()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string'),
        ]),
        pa.union(
            [pa.field('a', pa.int8()), pa.field('b', pa.int16())],
            pa.lib.UnionMode_SPARSE),
        pa.union(
            [pa.field('a', pa.int8()), pa.field('b', pa.int16())],
            pa.lib.UnionMode_DENSE),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal128(12, 2),
        pa.decimal256(76, 38),
        pa.field('a', 'string', metadata={b'foo': b'bar'}),
        pa.list_(pa.field("element", pa.int64())),
        pa.large_list(pa.field("element", pa.int64())),
        pa.map_(pa.field("key", pa.string(), nullable=False),
                pa.field("value", pa.int8())),
    ]

    for case in cases:
        assert case == roundtrip(case)

    # Field objects go into the schema unchanged; bare types get a
    # positional name.
    schema_fields = [
        case if isinstance(case, pa.Field)
        else pa.field('_f{}'.format(pos), case)
        for pos, case in enumerate(cases)
    ]
    schema = pa.schema(schema_fields, metadata={b'foo': b'bar'})
    assert schema == roundtrip(schema)
def test_is_union():
    """types.is_union is true for sparse and dense unions, false for a list."""
    sparse_then_dense = (pa.lib.UnionMode_SPARSE, pa.lib.UnionMode_DENSE)
    for union_mode in sparse_then_dense:
        members = [
            pa.field('a', pa.int32()),
            pa.field('b', pa.int8()),
            pa.field('c', pa.string()),
        ]
        union_type = pa.union(members, mode=union_mode)
        assert types.is_union(union_type)

    int_list = pa.list_(pa.int32())
    assert not types.is_union(int_list)
def arrow_type(field):
    """Return the pyarrow data type selected by ``field.name``.

    Parameters
    ----------
    field : object
        Must expose ``name``.  Depending on the name, one or more of
        ``precision``, ``cont_types``, ``type``, ``key``, ``value`` and
        ``fields`` are also read and passed straight to the matching
        pyarrow factory.

    Raises
    ------
    ValueError
        If ``field.name`` is not one of the handled names.
    """
    kind = field.name

    # Parameterised types: built from attributes of ``field``.
    if kind == "decimal":
        # NOTE(review): only the precision is forwarded; confirm that
        # leaving the scale at pyarrow's default is intended.
        return pa.decimal128(field.precision)
    if kind == "uniontype":
        return pa.union(field.cont_types)
    if kind == "array":
        return pa.list_(field.type)
    if kind == "map":
        return pa.map_(field.key, field.value)
    if kind == "struct":
        return pa.struct(field.fields)

    # Everything else is a fixed name -> type lookup.
    simple_types = {
        "boolean": pa.bool_(),
        "tinyint": pa.int8(),
        "smallint": pa.int16(),
        "int": pa.int32(),
        "bigint": pa.int64(),
        "float": pa.float32(),
        "double": pa.float64(),
        "string": pa.string(),
        "char": pa.string(),
        "varchar": pa.string(),
        "binary": pa.binary(),
        "timestamp": pa.timestamp("ms"),
        "date": pa.date32(),
    }
    if kind in simple_types:
        return simple_types[kind]
    raise ValueError("Cannot convert to arrow type: " + kind)
def arrow_type(field):
    """Return the pyarrow data type selected by ``field.name``.

    Parameters
    ----------
    field : object
        Must expose ``name``.  Depending on the name, one or more of
        ``precision``, ``cont_types``, ``type``, ``key``, ``value`` and
        ``fields`` are also read and passed straight to the matching
        pyarrow factory.

    Raises
    ------
    ValueError
        If ``field.name`` is not one of the handled names.
    """
    kind = field.name

    # Parameterised types: built from attributes of ``field``.
    if kind == 'decimal':
        # NOTE(review): only the precision is forwarded; confirm that
        # leaving the scale at pyarrow's default is intended.
        return pa.decimal128(field.precision)
    if kind == 'uniontype':
        return pa.union(field.cont_types)
    if kind == 'array':
        return pa.list_(field.type)
    if kind == 'map':
        return pa.map_(field.key, field.value)
    if kind == 'struct':
        return pa.struct(field.fields)

    # Everything else is a fixed name -> type lookup.
    simple_types = {
        'boolean': pa.bool_(),
        'tinyint': pa.int8(),
        'smallint': pa.int16(),
        'int': pa.int32(),
        'bigint': pa.int64(),
        'float': pa.float32(),
        'double': pa.float64(),
        'string': pa.string(),
        'char': pa.string(),
        'varchar': pa.string(),
        'binary': pa.binary(),
        'timestamp': pa.timestamp('ms'),
        'date': pa.date32(),
    }
    if kind in simple_types:
        return simple_types[kind]
    raise ValueError('Cannot convert to arrow type: ' + kind)
def test_is_union():
    """types.is_union is true for a sparse union and false for a list type."""
    members = [
        pa.field('a', pa.int32()),
        pa.field('b', pa.int8()),
        pa.field('c', pa.string()),
    ]
    sparse_union = pa.union(members, pa.lib.UnionMode_SPARSE)
    assert types.is_union(sparse_union)

    int_list = pa.list_(pa.int32())
    assert not types.is_union(int_list)
def test_union_type():
    """Check pa.union for both modes and for rejected mode values.

    For each accepted spelling of the sparse and dense modes (string or
    ``pa.lib.UnionMode_*`` constant) the resulting type must report the
    expected mode string and expose the child fields it was built from.
    Unknown mode values must raise ``ValueError`` matching
    'Invalid union mode'.
    """
    def check_fields(ty, expected):
        # num_fields replaces the deprecated num_children spelling.
        assert ty.num_fields == len(expected)
        assert [ty[idx] for idx in range(ty.num_fields)] == expected

    fields = [pa.field('x', pa.list_(pa.int32())),
              pa.field('y', pa.binary())]

    for mode in ('sparse', pa.lib.UnionMode_SPARSE):
        ty = pa.union(fields, mode=mode)
        assert ty.mode == 'sparse'
        check_fields(ty, fields)

    for mode in ('dense', pa.lib.UnionMode_DENSE):
        ty = pa.union(fields, mode=mode)
        assert ty.mode == 'dense'
        check_fields(ty, fields)

    for mode in ('unknown', 2):
        with pytest.raises(ValueError, match='Invalid union mode'):
            pa.union(fields, mode=mode)
def test_type_schema_pickling():
    """Data types, a field and a schema built from them survive pickling.

    Each case is pickled and unpickled and compared for equality with the
    original.  The cases are then wrapped into fields (existing Field
    objects are used as-is) to build a schema with metadata, which is
    round-tripped the same way.
    """
    def roundtrip(obj):
        # Only objects created by this test are unpickled here.
        return pickle.loads(pickle.dumps(obj))

    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string'),
        ]),
        pa.union(
            [pa.field('a', pa.int8()), pa.field('b', pa.int16())],
            pa.lib.UnionMode_SPARSE),
        pa.union(
            [pa.field('a', pa.int8()), pa.field('b', pa.int16())],
            pa.lib.UnionMode_DENSE),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal128(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'}),
    ]

    for case in cases:
        assert case == roundtrip(case)

    # Field objects go into the schema unchanged; bare types get a
    # positional name.
    schema_fields = [
        case if isinstance(case, pa.Field)
        else pa.field('_f{}'.format(pos), case)
        for pos, case in enumerate(cases)
    ]
    schema = pa.schema(schema_fields, metadata={b'foo': b'bar'})
    assert schema == roundtrip(schema)
def test_union_type():
    """Check pa.union for both modes, type codes and invalid mode values.

    For each accepted spelling of the sparse and dense modes the test
    verifies the reported mode, the child fields, the default and
    explicit type codes, and that a type-code list of the wrong length
    raises ``ValueError``.  Unknown mode values must raise ``ValueError``
    matching 'Invalid union mode'.
    """
    def check_fields(ty, expected):
        assert ty.num_fields == len(expected)
        assert [ty[idx] for idx in range(ty.num_fields)] == expected

    fields = [pa.field('x', pa.list_(pa.int32())),
              pa.field('y', pa.binary())]
    type_codes = [5, 9]

    # (expected mode string, accepted spellings of that mode), sparse first.
    scenarios = (
        ('sparse', ('sparse', pa.lib.UnionMode_SPARSE)),
        ('dense', ('dense', pa.lib.UnionMode_DENSE)),
    )

    for mode_name, spellings in scenarios:
        for mode in spellings:
            # Default type codes
            ty = pa.union(fields, mode=mode)
            assert ty.mode == mode_name
            check_fields(ty, fields)
            assert ty.type_codes == [0, 1]

            # Explicit type codes
            ty = pa.union(fields, mode=mode, type_codes=type_codes)
            assert ty.mode == mode_name
            check_fields(ty, fields)
            assert ty.type_codes == type_codes

            # Invalid number of type codes
            with pytest.raises(ValueError):
                pa.union(fields, mode=mode, type_codes=type_codes[1:])

    for bad_mode in ('unknown', 2):
        with pytest.raises(ValueError, match='Invalid union mode'):
            pa.union(fields, mode=bad_mode)
def get_many_types():
    """Return a tuple of assorted Arrow data types used as test inputs.

    The types are produced by a function rather than stored at module
    level; the original note gives as the reason that the pa.dictionary
    type holds a pyarrow array, and that
    test_array.py::test_total_bytes_allocated checks that the default
    memory pool has zero allocated bytes.
    """
    # NOTE(review): the dictionary type below is built from a value type,
    # not an array, so the rationale above may be stale -- confirm.
    scalars = (pa.null(), pa.bool_(), pa.int32())
    temporal = (
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
    )
    numeric = (
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
    )
    binary_like = (pa.string(), pa.binary(), pa.binary(10))
    nested = (
        pa.list_(pa.int32()),
        pa.struct([
            pa.field('a', pa.int32()),
            pa.field('b', pa.int8()),
            pa.field('c', pa.string()),
        ]),
        pa.struct([
            pa.field('a', pa.int32(), nullable=False),
            pa.field('b', pa.int8(), nullable=False),
            pa.field('c', pa.string()),
        ]),
    )
    unions = (
        pa.union(
            [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_DENSE),
        pa.union(
            [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_SPARSE),
        pa.union(
            [pa.field('a', pa.binary(10), nullable=False),
             pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_SPARSE),
    )
    dictionaries = (
        pa.dictionary(pa.int32(), pa.string()),
    )
    # Concatenation keeps the elements in the same order as they are listed.
    return (scalars + temporal + numeric + binary_like + nested + unions
            + dictionaries)
def test_is_union():
    """types.is_union is true for a sparse union and false for a list type."""
    members = [
        pa.field('a', pa.int32()),
        pa.field('b', pa.int8()),
        pa.field('c', pa.string()),
    ]
    sparse_union = pa.union(members, pa.lib.UnionMode_SPARSE)
    assert types.is_union(sparse_union)

    int_list = pa.list_(pa.int32())
    assert not types.is_union(int_list)
pa.date32(), pa.timestamp('us'), pa.timestamp('us', tz='UTC'), pa.timestamp('us', tz='Europe/Paris'), pa.float16(), pa.float32(), pa.float64(), pa.decimal128(19, 4), pa.string(), pa.binary(), pa.binary(10), pa.list_(pa.int32()), pa.struct([pa.field('a', pa.int32()), pa.field('b', pa.int8()), pa.field('c', pa.string())]), pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE), pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE), # XXX Needs array pickling # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])), ] def test_is_boolean(): assert types.is_boolean(pa.bool_()) assert not types.is_boolean(pa.int8()) def test_is_integer(): signed_ints = [pa.int8(), pa.int16(), pa.int32(), pa.int64()] unsigned_ints = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]
pa.struct( [ pa.field("a", pa.int32(), nullable=False), pa.field("b", pa.int8(), nullable=False), pa.field("c", pa.string()), ] ), pa.dictionary(pa.int8(), pa.string()), ] _unsupported_pyarrow_types = [ pa.decimal256(76, 38), pa.duration("s"), pa.map_(pa.string(), pa.int32()), pa.union( [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], mode=pa.lib.UnionMode_DENSE, ), pa.union( [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], mode=pa.lib.UnionMode_DENSE, type_codes=[4, 8], ), pa.union( [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], mode=pa.lib.UnionMode_SPARSE, ), pa.union( [ pa.field("a", pa.binary(10), nullable=False), pa.field("b", pa.string()), ],
pa.timestamp('us', tz='Europe/Paris'), pa.float16(), pa.float32(), pa.float64(), pa.decimal128(19, 4), pa.string(), pa.binary(), pa.binary(10), pa.list_(pa.int32()), pa.struct([ pa.field('a', pa.int32()), pa.field('b', pa.int8()), pa.field('c', pa.string()) ]), pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE), pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE), # XXX Needs array pickling # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])), ] def test_is_boolean(): assert types.is_boolean(pa.bool_()) assert not types.is_boolean(pa.int8()) def test_is_integer():