def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
    ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
    ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)
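# `_check_cast_case` is a helper defined elsewhere in this test module; a
# minimal sketch of what it presumably verifies, assuming a case is an
# (in_data, in_type, out_data, out_type) tuple (hypothetical
# reimplementation, not the actual helper):
def _check_cast_case(case, safe=True):
    in_data, in_type, out_data, out_type = case
    in_arr = pa.array(in_data, type=in_type)
    casted = in_arr.cast(out_type, safe=safe)
    expected = pa.array(out_data, type=out_type)
    assert casted.equals(expected)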
def test_convert_options():
    cls = ConvertOptions
    opts = cls()

    assert opts.check_utf8 is True
    opts.check_utf8 = False
    assert opts.check_utf8 is False

    assert opts.strings_can_be_null is False
    opts.strings_can_be_null = True
    assert opts.strings_can_be_null is True

    assert opts.column_types == {}
    # Pass column_types as mapping
    opts.column_types = {'b': pa.int16(), 'c': pa.float32()}
    assert opts.column_types == {'b': pa.int16(), 'c': pa.float32()}
    opts.column_types = {'v': 'int16', 'w': 'null'}
    assert opts.column_types == {'v': pa.int16(), 'w': pa.null()}
    # Pass column_types as schema
    schema = pa.schema([('a', pa.int32()), ('b', pa.string())])
    opts.column_types = schema
    assert opts.column_types == {'a': pa.int32(), 'b': pa.string()}
    # Pass column_types as sequence
    opts.column_types = [('x', pa.binary())]
    assert opts.column_types == {'x': pa.binary()}

    with pytest.raises(TypeError, match='DataType expected'):
        opts.column_types = {'a': None}
    with pytest.raises(TypeError):
        opts.column_types = 0

    assert isinstance(opts.null_values, list)
    assert '' in opts.null_values
    assert 'N/A' in opts.null_values
    opts.null_values = ['xxx', 'yyy']
    assert opts.null_values == ['xxx', 'yyy']

    assert isinstance(opts.true_values, list)
    opts.true_values = ['xxx', 'yyy']
    assert opts.true_values == ['xxx', 'yyy']

    assert isinstance(opts.false_values, list)
    opts.false_values = ['xxx', 'yyy']
    assert opts.false_values == ['xxx', 'yyy']

    opts = cls(check_utf8=False, column_types={'a': pa.null()},
               null_values=['N', 'nn'], true_values=['T', 'tt'],
               false_values=['F', 'ff'], strings_can_be_null=True)
    assert opts.check_utf8 is False
    assert opts.column_types == {'a': pa.null()}
    assert opts.null_values == ['N', 'nn']
    assert opts.false_values == ['F', 'ff']
    assert opts.true_values == ['T', 'tt']
    assert opts.strings_can_be_null is True
def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]
    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: Ensure that supported type conversions don't
            # segfault on empty arrays of common types
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
def test_custom_nulls(self):
    # Infer nulls with custom values
    opts = ConvertOptions(null_values=['Xxx', 'Zzz'])
    rows = b"a,b,c,d\nZzz,Xxx,1,2\nXxx,#N/A,,Zzz\n"
    table = self.read_bytes(rows, convert_options=opts)
    schema = pa.schema([('a', pa.null()),
                        ('b', pa.string()),
                        ('c', pa.string()),
                        ('d', pa.int64())])
    assert table.schema == schema
    assert table.to_pydict() == {
        'a': [None, None],
        'b': [u"Xxx", u"#N/A"],
        'c': [u"1", u""],
        'd': [2, None],
    }

    opts = ConvertOptions(null_values=[])
    rows = b"a,b\n#N/A,\n"
    table = self.read_bytes(rows, convert_options=opts)
    schema = pa.schema([('a', pa.string()),
                        ('b', pa.string())])
    assert table.schema == schema
    assert table.to_pydict() == {
        'a': [u"#N/A"],
        'b': [u""],
    }
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
def field(jvm_field):
    """
    Construct a Field from an org.apache.arrow.vector.types.pojo.Field
    instance.

    Parameters
    ----------
    jvm_field: org.apache.arrow.vector.types.pojo.Field

    Returns
    -------
    pyarrow.Field
    """
    name = jvm_field.getName()
    jvm_type = jvm_field.getType()

    typ = None
    if not jvm_type.isComplex():
        type_str = jvm_type.getTypeID().toString()
        if type_str == 'Null':
            typ = pa.null()
        elif type_str == 'Int':
            typ = _from_jvm_int_type(jvm_type)
        elif type_str == 'FloatingPoint':
            typ = _from_jvm_float_type(jvm_type)
        elif type_str == 'Utf8':
            typ = pa.string()
        elif type_str == 'Binary':
            typ = pa.binary()
        elif type_str == 'FixedSizeBinary':
            typ = pa.binary(jvm_type.getByteWidth())
        elif type_str == 'Bool':
            typ = pa.bool_()
        elif type_str == 'Time':
            typ = _from_jvm_time_type(jvm_type)
        elif type_str == 'Timestamp':
            typ = _from_jvm_timestamp_type(jvm_type)
        elif type_str == 'Date':
            typ = _from_jvm_date_type(jvm_type)
        elif type_str == 'Decimal':
            typ = pa.decimal128(jvm_type.getPrecision(), jvm_type.getScale())
        else:
            raise NotImplementedError(
                "Unsupported JVM type: {}".format(type_str))
    else:
        # TODO: The following JVM types are not implemented:
        #       Struct, List, FixedSizeList, Union, Dictionary
        raise NotImplementedError(
            "JVM field conversion only implemented for primitive types.")

    nullable = jvm_field.isNullable()
    if jvm_field.getMetadata().isEmpty():
        metadata = None
    else:
        metadata = dict(jvm_field.getMetadata())
    return pa.field(name, typ, nullable, metadata)
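# For context, a minimal usage sketch. It assumes a JVM started through
# jpype with the Arrow Java classes on the classpath, and builds the JVM
# Field from a JSON spec via Jackson (the same readValue mechanism used in
# the JSON-spec test snippets in this collection); the JSON string is
# illustrative:
import jpype

om = jpype.JClass('com.fasterxml.jackson.databind.ObjectMapper')()
pojo_Field = jpype.JClass('org.apache.arrow.vector.types.pojo.Field')
spec = '{"name":"f0","nullable":true,"type":{"name":"utf8"},"children":[]}'
jvm_field = om.readValue(spec, pojo_Field)
assert field(jvm_field) == pa.field('f0', pa.string(), nullable=True)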
def test_nested_lists_all_none(self):
    data = np.array([[None, None], None], dtype=object)
    arr = pa.array(data)
    expected = pa.array(list(data))
    assert arr.equals(expected)
    assert arr.type == pa.list_(pa.null())

    data2 = np.array([None, None, [None, None],
                      np.array([None, None], dtype=object)],
                     dtype=object)
    arr = pa.array(data2)
    expected = pa.array([None, None, [None, None], [None, None]])
    assert arr.equals(expected)
def test_simple_nulls(self):
    # Infer various kinds of data, with nulls
    rows = (b'{"a": 1, "b": 2, "c": null, "d": null, "e": null}\n'
            b'{"a": null, "b": -5, "c": "foo", "d": null, "e": true}\n'
            b'{"a": 4.5, "b": null, "c": "nan", "d": null,"e": false}\n')
    table = self.read_bytes(rows)
    schema = pa.schema([('a', pa.float64()),
                        ('b', pa.int64()),
                        ('c', pa.string()),
                        ('d', pa.null()),
                        ('e', pa.bool_())])
    assert table.schema == schema
    assert table.to_pydict() == {
        'a': [1.0, None, 4.5],
        'b': [2, -5, None],
        'c': [None, u"foo", u"nan"],
        'd': [None, None, None],
        'e': [None, True, False],
    }
def test_simple_nulls(self):
    # Infer various kinds of data, with nulls
    rows = (b"a,b,c,d,e\n"
            b"1,2,,,3\n"
            b"nan,-5,foo,,nan\n"
            b"4.5,#N/A,nan,,\xff\n")
    table = self.read_bytes(rows)
    schema = pa.schema([('a', pa.float64()),
                        ('b', pa.int64()),
                        ('c', pa.string()),
                        ('d', pa.null()),
                        ('e', pa.binary())])
    assert table.schema == schema
    assert table.to_pydict() == {
        'a': [1.0, None, 4.5],
        'b': [2, -5, None],
        'c': [u"", u"foo", u"nan"],
        'd': [None, None, None],
        'e': [b"3", b"nan", b"\xff"],
    }
def test_types_hashable():
    types = [
        pa.null(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())])
    ]
    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i
def get_many_types():
    # Returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array, and
    # test_array.py::test_total_bytes_allocated checks that the default
    # memory pool has zero allocated bytes.
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )
def test_empty_iterable():
    arr = pa.array(StrangeIterable([]))
    assert len(arr) == 0
    assert arr.null_count == 0
    assert arr.type == pa.null()
    assert arr.to_pylist() == []
def test_empty_list(seq):
    arr = pa.array(seq([]))
    assert len(arr) == 0
    assert arr.null_count == 0
    assert arr.type == pa.null()
    assert arr.to_pylist() == []
    pojo_Field = jpype.JClass('org.apache.arrow.vector.types.pojo.Field')
    return om.readValue(jvm_spec, pojo_Field)


# In the following, we use the JSON serialization of the Field objects in
# Java. This ensures that we don't rely on the exact mechanics of how to
# construct them using Java code, and it also lets us define them as
# parameters without having to invoke the JVM.
#
# The specifications were created using:
#
#     om = jpype.JClass('com.fasterxml.jackson.databind.ObjectMapper')()
#     field = …  # Code to instantiate the field
#     jvm_spec = om.writeValueAsString(field)
@pytest.mark.parametrize('typ,jvm_spec', [
    (pa.null(), '{"name":"null"}'),
    (pa.bool_(), '{"name":"bool"}'),
    (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
    (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
    (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
    (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
    (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
    (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
    (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
    (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
    (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
    (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
    (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
    (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
    (pa.time32('ms'), '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
    (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
def test_is_null():
    assert types.is_null(pa.null())
    assert not types.is_null(pa.list_(pa.int32()))
def test_all_none(self):
    arr = pyarrow.from_pylist([None, None])
    assert len(arr) == 2
    assert arr.null_count == 2
    assert arr.type == pyarrow.null()
    assert arr.to_pylist() == [None, None]
def test_sequence_all_none(seq):
    arr = pa.array(seq([None, None]))
    assert len(arr) == 2
    assert arr.null_count == 2
    assert arr.type == pa.null()
    assert arr.to_pylist() == [None, None]
def test_all_none(self):
    arr = pa.array([None, None])
    assert len(arr) == 2
    assert arr.null_count == 2
    assert arr.type == pa.null()
    assert arr.to_pylist() == [None, None]
        np.arange(10, dtype=np.float16),
    ]
)
def test_to_numpy_roundtrip(narr):
    arr = pa.array(narr)
    assert narr.dtype == arr.to_numpy().dtype
    np.testing.assert_array_equal(narr, arr.to_numpy())
    np.testing.assert_array_equal(narr[:6], arr[:6].to_numpy())
    np.testing.assert_array_equal(narr[2:], arr[2:].to_numpy())
    np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy())


@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
import pyarrow as pa
import six
from pandas.api.types import (
    is_array_like,
    is_bool_dtype,
    is_int64_dtype,
    is_integer,
    is_integer_dtype,
)
from pandas.core.arrays import ExtensionArray
from pandas.core.dtypes.dtypes import ExtensionDtype

from ._algorithms import all_op, any_op, extract_isnull_bytemap

_python_type_map = {
    pa.null().id: six.text_type,
    pa.bool_().id: bool,
    pa.int8().id: int,
    pa.uint8().id: int,
    pa.int16().id: int,
    pa.uint16().id: int,
    pa.int32().id: int,
    pa.uint32().id: int,
    pa.int64().id: int,
    pa.uint64().id: int,
    pa.float16().id: float,
    pa.float32().id: float,
    pa.float64().id: float,
    pa.date32().id: datetime.date,
    pa.date64().id: datetime.date,
    pa.timestamp("ms").id: datetime.datetime,
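# For context, a small usage sketch (hypothetical, not part of the module
# above): the map is keyed by Arrow type ids, so the Python value type for
# an array can be looked up via `arr.type.id`.
import pyarrow as pa

arr = pa.array([1, 2, None])  # inferred as int64
assert _python_type_map[arr.type.id] is int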
def test_is_boolean_value():
    assert not pa.types.is_boolean_value(1)
    assert pa.types.is_boolean_value(True)
    assert pa.types.is_boolean_value(False)
    assert pa.types.is_boolean_value(np.bool_(True))
    assert pa.types.is_boolean_value(np.bool_(False))


@h.given(
    past.all_types |
    past.all_fields |
    past.all_schemas
)
@h.example(
    pa.field(name='', type=pa.null(), metadata={'0': '', '': ''})
)
def test_pickling(field):
    data = pickle.dumps(field)
    assert pickle.loads(data) == field


@h.given(
    st.lists(past.all_types) |
    st.lists(past.all_fields) |
    st.lists(past.all_schemas)
)
def test_hashing(items):
    h.assume(
        # well, this is still O(n^2), but makes the input unique
        all(not a.equals(b)
            for i, a in enumerate(items)
            for b in items[:i])
                 pa.array([[3]], type=pa.list_(pa.int32()))
             ], ["f1", "f2"])
         }],
         expected_output={
             "struct<binary, list<int32>>": pa.StructArray.from_arrays([
                 pa.array([b"abc", None, b"def", b"ghi"]),
                 pa.array([[None], [1, 2], [], [3]],
                          type=pa.list_(pa.int32()))
             ], ["f1", "f2"])
         }),
    dict(
        testcase_name="missing_or_null_column_fixed_width",
        inputs=[
            {"int32": pa.array([None, None], type=pa.null())},
            {"int64": pa.array([None, None], type=pa.null())},
            {"int64": pa.array([123], type=pa.int64())},
            {"int32": pa.array([456], type=pa.int32())},
        ],
        expected_output={
            "int32": pa.array([None, None, None, None, None, 456],
                              type=pa.int32()),
            "int64":
        ([(1, 'a'), (2, 'c'), None],
         pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
    ]
)
def test_array_pickle(data, typ):
    # Allocate here so that we don't have any Arrow data allocated.
    # This is needed to ensure that allocator tests can be reliable.
    array = pa.array(data, type=typ)
    result = pickle.loads(pickle.dumps(array))
    assert array.equals(result)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
import hypothesis.extra.numpy as npst
import hypothesis.extra.pytz as tzst
import numpy as np
import pyarrow as pa


# TODO(kszucs): alphanum_text, surrogate_text
custom_text = st.text(
    alphabet=st.characters(
        min_codepoint=0x41,
        max_codepoint=0x7E
    )
)

null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())

signed_integer_types = st.sampled_from([
    pa.int8(),
    pa.int16(),
    pa.int32(),
    pa.int64()
])
unsigned_integer_types = st.sampled_from([
    pa.uint8(),
    pa.uint16(),
    pa.uint32(),
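# For context, a small hypothetical sketch of composing such strategies:
# st.one_of combines them, and .example() draws a value (intended for
# interactive exploration only, not for use inside tests).
import hypothesis.strategies as st
import pyarrow as pa

primitive_types = st.one_of(null_type, bool_type, signed_integer_types)
assert isinstance(primitive_types.example(), pa.DataType)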
    expected = pa.array([0, 1, 2], type='i8')
    result = arr.cast('i8')
    assert result.equals(expected)


def test_simple_type_construction():
    result = pa.lib.TimestampType()
    with pytest.raises(TypeError):
        str(result)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [(pa.null(), 'float64'),
     (pa.bool_(), 'bool'),
     (pa.int8(), 'int8'),
     (pa.int16(), 'int16'),
     (pa.int32(), 'int32'),
     (pa.int64(), 'int64'),
     (pa.uint8(), 'uint8'),
     (pa.uint16(), 'uint16'),
     (pa.uint32(), 'uint32'),
     (pa.uint64(), 'uint64'),
     (pa.float16(), 'float16'),
     (pa.float32(), 'float32'),
     (pa.float64(), 'float64'),
     (pa.date32(), 'date'),
     (pa.date64(), 'date'),
     (pa.binary(), 'bytes'),
     (pa.binary(length=4), 'bytes'),
     (pa.string(), 'unicode'),
     (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
     (pa.decimal128(18, 3), 'decimal'),
     (pa.timestamp('ms'), 'datetime'),
     (pa.timestamp('us', 'UTC'), 'datetimetz'),
     (pa.time32('s'), 'time'),
     (pa.time64('us'), 'time')])
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_conversions_no_sentinel_values():
    ]

    for case in safe_cases:
        _check_cast_case(case)


def test_simple_type_construction():
    result = pa.lib.TimestampType()
    with pytest.raises(TypeError):
        str(result)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'float64'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
                ),
                pa.array([0.1, 0.2], type=pa.float32()),
            ],
            names=["b", "time", "a"],
        ),
    ),
    (
        pd.DataFrame({
            "nanfloat": [None, 1.0],
            "nans": [pd.NA, pd.NA],
            "str": ["a", "b"]
        }),
        pa.Table.from_arrays(
            [
                pa.array([None, 1.0], type=pa.float32()),
                pa.array([None, None], type=pa.null()),
                pa.array(["a", "b"], type=pa.string()),
            ],
            names=["nanfloat", "nans", "str"],
        ),
    ),
    httpfail(
        pd.DataFrame({
            "nanint": [pd.NA, 3],  # arrow doesn't like this
        }),
        None,
    ),
    httpfail(
        pd.DataFrame({
            "nanstr": [pd.NA, "string"],
        }),
def test_empty_range():
    arr = pa.array(range(0))
    assert len(arr) == 0
    assert arr.null_count == 0
    assert arr.type == pa.null()
    assert arr.to_pylist() == []
_DECODE_CASES = [
    dict(testcase_name="without_schema_first_example_typed",
         schema_text_proto=None,
         sequence_examples_text_proto=[
             _TYPED_SEQUENCE_EXAMPLE, _UNTYPED_SEQUENCE_EXAMPLE,
             _SOME_FEATURES_TYPED_SEQUENCE_EXAMPLE,
             _EMPTY_VALUES_LIST_SEQUENCE_EXAMPLE
         ],
         expected=pa.RecordBatch.from_arrays([
             pa.array([[1], None, None, []],
                      type=pa.large_list(pa.int64())),
             pa.array([[1.0, 2.0], None, None, []],
                      type=pa.large_list(pa.float32())),
             pa.array([[b"a", b"b", b"c"], None, None, []],
                      type=pa.large_list(pa.large_binary())),
             pa.array([None, None, None, None], pa.null()),
             pa.array([None, None, [1.0], None],
                      type=pa.large_list(pa.float32())),
             pa.StructArray.from_arrays([
                 pa.array([None, None, [[1.0]], None],
                          type=pa.large_list(pa.large_list(pa.float32()))),
                 pa.array([[[1, 2], [3]], [], [None, None, None], [[], []]],
                          type=pa.large_list(pa.large_list(pa.int64()))),
                 pa.array([[[3.0, 4.0], [1.0, 2.0]], [], [None], [[]]],
                          type=pa.large_list(pa.large_list(pa.float32()))),
                 pa.array([[[b"a", b"b"], [b"c"]], [], [None], [[]]],
                          type=pa.large_list(pa.large_list(pa.large_binary())))
             ], names=[
                 "sequence_v", "sequence_x", "sequence_y", "sequence_z"
def test_convert_options():
    cls = ConvertOptions
    opts = cls()

    check_options_class(cls, check_utf8=[True, False],
                        strings_can_be_null=[False, True],
                        include_columns=[[], ['def', 'abc']],
                        include_missing_columns=[False, True],
                        auto_dict_encode=[False, True],
                        timestamp_parsers=[[], [ISO8601, '%y-%m']])

    assert opts.auto_dict_max_cardinality > 0
    opts.auto_dict_max_cardinality = 99999
    assert opts.auto_dict_max_cardinality == 99999

    assert opts.column_types == {}
    # Pass column_types as mapping
    opts.column_types = {'b': pa.int16(), 'c': pa.float32()}
    assert opts.column_types == {'b': pa.int16(), 'c': pa.float32()}
    opts.column_types = {'v': 'int16', 'w': 'null'}
    assert opts.column_types == {'v': pa.int16(), 'w': pa.null()}
    # Pass column_types as schema
    schema = pa.schema([('a', pa.int32()), ('b', pa.string())])
    opts.column_types = schema
    assert opts.column_types == {'a': pa.int32(), 'b': pa.string()}
    # Pass column_types as sequence
    opts.column_types = [('x', pa.binary())]
    assert opts.column_types == {'x': pa.binary()}

    with pytest.raises(TypeError, match='DataType expected'):
        opts.column_types = {'a': None}
    with pytest.raises(TypeError):
        opts.column_types = 0

    assert isinstance(opts.null_values, list)
    assert '' in opts.null_values
    assert 'N/A' in opts.null_values
    opts.null_values = ['xxx', 'yyy']
    assert opts.null_values == ['xxx', 'yyy']

    assert isinstance(opts.true_values, list)
    opts.true_values = ['xxx', 'yyy']
    assert opts.true_values == ['xxx', 'yyy']

    assert isinstance(opts.false_values, list)
    opts.false_values = ['xxx', 'yyy']
    assert opts.false_values == ['xxx', 'yyy']

    assert opts.timestamp_parsers == []
    opts.timestamp_parsers = [ISO8601]
    assert opts.timestamp_parsers == [ISO8601]

    opts = cls(column_types={'a': pa.null()},
               null_values=['N', 'nn'], true_values=['T', 'tt'],
               false_values=['F', 'ff'], auto_dict_max_cardinality=999,
               timestamp_parsers=[ISO8601, '%Y-%m-%d'])
    assert opts.column_types == {'a': pa.null()}
    assert opts.null_values == ['N', 'nn']
    assert opts.false_values == ['F', 'ff']
    assert opts.true_values == ['T', 'tt']
    assert opts.auto_dict_max_cardinality == 999
    assert opts.timestamp_parsers == [ISO8601, '%Y-%m-%d']
def test_null_field_may_not_be_non_nullable():
    # ARROW-7273
    with pytest.raises(ValueError):
        pa.field('f0', pa.null(), nullable=False)
def test_header(self):
    rows = b"abc,def,gh\n"
    reader = self.open_bytes(rows)
    expected_schema = pa.schema([('abc', pa.null()),
                                 ('def', pa.null()),
                                 ('gh', pa.null())])
    self.check_reader(reader, expected_schema, [])
def test_column_options(self):
    # With column_names
    rows = b"1,2,3\n4,5,6"
    read_options = ReadOptions()
    read_options.column_names = ['d', 'e', 'f']
    reader = self.open_bytes(rows, read_options=read_options)
    expected_schema = pa.schema([('d', pa.int64()),
                                 ('e', pa.int64()),
                                 ('f', pa.int64())])
    self.check_reader(reader, expected_schema,
                      [{'d': [1, 4], 'e': [2, 5], 'f': [3, 6]}])

    # With include_columns
    convert_options = ConvertOptions()
    convert_options.include_columns = ['f', 'e']
    reader = self.open_bytes(rows, read_options=read_options,
                             convert_options=convert_options)
    expected_schema = pa.schema([('f', pa.int64()),
                                 ('e', pa.int64())])
    self.check_reader(reader, expected_schema,
                      [{'e': [2, 5], 'f': [3, 6]}])

    # With column_types
    convert_options.column_types = {'e': pa.string()}
    reader = self.open_bytes(rows, read_options=read_options,
                             convert_options=convert_options)
    expected_schema = pa.schema([('f', pa.int64()),
                                 ('e', pa.string())])
    self.check_reader(reader, expected_schema,
                      [{'e': ["2", "5"], 'f': [3, 6]}])

    # Missing columns in include_columns
    convert_options.include_columns = ['g', 'f', 'e']
    with pytest.raises(
            KeyError,
            match="Column 'g' in include_columns does not exist"):
        reader = self.open_bytes(rows, read_options=read_options,
                                 convert_options=convert_options)

    convert_options.include_missing_columns = True
    reader = self.open_bytes(rows, read_options=read_options,
                             convert_options=convert_options)
    expected_schema = pa.schema([('g', pa.null()),
                                 ('f', pa.int64()),
                                 ('e', pa.string())])
    self.check_reader(reader, expected_schema,
                      [{'g': [None, None], 'e': ["2", "5"], 'f': [3, 6]}])

    convert_options.column_types = {'e': pa.string(), 'g': pa.float64()}
    reader = self.open_bytes(rows, read_options=read_options,
                             convert_options=convert_options)
    expected_schema = pa.schema([('g', pa.float64()),
                                 ('f', pa.int64()),
                                 ('e', pa.string())])
    self.check_reader(reader, expected_schema,
                      [{'g': [None, None], 'e': ["2", "5"], 'f': [3, 6]}])
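# For context, a standalone sketch of the same options against the public
# pyarrow.csv API (a hypothetical minimal usage outside the test harness
# above; `open_bytes`/`check_reader` are harness helpers):
import io
from pyarrow import csv

read_options = csv.ReadOptions(column_names=['d', 'e', 'f'])
convert_options = csv.ConvertOptions(include_columns=['f', 'e'])
table = csv.read_csv(io.BytesIO(b"1,2,3\n4,5,6"),
                     read_options=read_options,
                     convert_options=convert_options)
# Output column order follows include_columns, as the test above asserts.
assert table.column_names == ['f', 'e']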
def test_lift_slice_aware(self):
    examples = [
        ('slice1',
         pa.Table.from_arrays([
             pa.array([['a'], ['a'], ['b'], ['a']]),
             pa.array([['cat'], ['dog'], ['cat'], ['dog']]),
         ], ['categorical_x', 'string_y'])),
        ('slice2',
         pa.Table.from_arrays([
             pa.array([['a'], ['a'], ['a']]),
             pa.array([['cat'], ['dog'], ['dog']]),
         ], ['categorical_x', 'string_y'])),
        ('slice1',
         pa.Table.from_arrays([
             pa.array([['a'], ['a'], ['b'], ['a']]),
             pa.array([['cat'], ['dog'], ['cat'], ['dog']]),
         ], ['categorical_x', 'string_y'])),
        ('slice2',
         pa.Table.from_arrays([
             pa.array([None, None, None, None], type=pa.null()),
             pa.array([['cat'], ['dog'], ['cat'], ['dog']]),
         ], ['categorical_x', 'string_y'])),
    ]
    schema = text_format.Parse(
        """
        feature {
          name: 'categorical_x'
          type: BYTES
        }
        feature {
          name: 'string_y'
          type: BYTES
        }
        """, schema_pb2.Schema())
    expected_result = [
        ('slice1',
         text_format.Parse(
             """
             cross_features {
               path_x { step: "categorical_x" }
               path_y { step: "string_y" }
               categorical_cross_stats {
                 lift {
                   lift_series {
                     y_string: "cat"
                     y_count: 4
                     lift_values {
                       x_string: "b"
                       lift: 2.0
                       x_count: 2
                       x_and_y_count: 2
                     }
                     lift_values {
                       x_string: "a"
                       lift: 0.6666667
                       x_count: 6
                       x_and_y_count: 2
                     }
                   }
                   lift_series {
                     y_string: "dog"
                     y_count: 4
                     lift_values {
                       x_string: "a"
                       lift: 1.3333333
                       x_count: 6
                       x_and_y_count: 4
                     }
                     lift_values {
                       x_string: "b"
                       lift: 0.0
                       x_count: 2
                       x_and_y_count: 0
                     }
                   }
                 }
               }
             }""", statistics_pb2.DatasetFeatureStatistics())),
        ('slice2',
         text_format.Parse(
             """
             cross_features {
               path_x { step: "categorical_x" }
               path_y { step: "string_y" }
               categorical_cross_stats {
                 lift {
                   lift_series {
                     y_string: "cat"
                     y_count: 3
                     lift_values {
                       x_string: "a"
                       lift: 0.7777778
                       x_count: 3
                       x_and_y_count: 1
                     }
                   }
                   lift_series {
                     y_string: "dog"
                     y_count: 4
                     lift_values {
                       x_string: "a"
                       lift: 1.1666667
                       x_count: 3
                       x_and_y_count: 2
                     }
                   }
                 }
               }
             }""", statistics_pb2.DatasetFeatureStatistics())),
    ]
    generator = lift_stats_generator.LiftStatsGenerator(
        schema=schema, y_path=types.FeaturePath(['string_y']))
    self.assertSlicingAwareTransformOutputEqual(examples, generator,
                                                expected_result)
        secondary_delimiter='|',
        expected_result=[
            pa.RecordBatch.from_arrays([
                pa.array([[1, 2.3]], pa.list_(pa.float32())),
                pa.array([[b'test']], pa.list_(pa.binary()))
            ], ['multivalent_feature', 'test_feature'])
        ]),
    dict(
        testcase_name='empty_multivalent_column',
        input_lines=['|,test'],
        column_names=['empty_feature', 'test_feature'],
        multivalent_columns_names=['empty_feature'],
        secondary_delimiter='|',
        expected_result=[
            pa.RecordBatch.from_arrays([
                pa.array([None], pa.null()),
                pa.array([[b'test']], pa.list_(pa.binary()))
            ], ['empty_feature', 'test_feature'])
        ]),
    dict(
        testcase_name='empty_string_multivalent_column',
        input_lines=['|,test', 'a|b,test'],
        column_names=['string_feature', 'test_feature'],
        multivalent_columns_names=['string_feature'],
        secondary_delimiter='|',
        expected_result=[
            pa.RecordBatch.from_arrays([
                pa.array([[b'', b''], [b'a', b'b']], pa.list_(pa.binary())),
                pa.array([[b'test'], [b'test']], pa.list_(pa.binary()))
            ], ['string_feature', 'test_feature'])
        ]),
             [pa.array([None, [1]], pa.list_(pa.int64()))], ['f1'])),
    dict(
        testcase_name='empty_csv',
        input_lines=[],
        column_names=['f1'],
        expected_csv_cells=[],
        expected_types=[csv_decoder.ColumnType.UNKNOWN],
        expected_record_batch=[],
    ),
    dict(testcase_name='null_column',
         input_lines=['', ''],
         column_names=['f1'],
         expected_csv_cells=[[], []],
         expected_types=[csv_decoder.ColumnType.UNKNOWN],
         expected_record_batch=pa.RecordBatch.from_arrays(
             [pa.array([None, None], pa.null())], ['f1'])),
    dict(testcase_name='size_2_vector_int_multivalent',
         input_lines=['12|14'],
         column_names=['x'],
         expected_csv_cells=[[b'12|14']],
         expected_types=[csv_decoder.ColumnType.INT],
         expected_record_batch=pa.RecordBatch.from_arrays(
             [pa.array([[12, 14]], pa.list_(pa.int64()))], ['x']),
         delimiter=' ',
         multivalent_columns=['x'],
         secondary_delimiter='|'),
    dict(testcase_name='space_and_comma_delimiter',
         input_lines=['1,2 "abcdef"', '5,1 "wxxyyz"'],
         column_names=['f1', 'f2'],
         expected_csv_cells=[[b'1,2', b'abcdef'], [b'5,1', b'wxxyyz']],
         expected_types=[
    else:
        return schema_cls(fields)


# In the following, we use the JSON serialization of the Field objects in
# Java. This ensures that we don't rely on the exact mechanics of how to
# construct them using Java code, and it also lets us define them as
# parameters without having to invoke the JVM.
#
# The specifications were created using:
#
#     om = jpype.JClass('com.fasterxml.jackson.databind.ObjectMapper')()
#     field = …  # Code to instantiate the field
#     jvm_spec = om.writeValueAsString(field)
@pytest.mark.parametrize('pa_type,jvm_spec', [
    (pa.null(), '{"name":"null"}'),
    (pa.bool_(), '{"name":"bool"}'),
    (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
    (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
    (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
    (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
    (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
    (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
    (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
    (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
    (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
    (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
    (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
    (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
    (pa.time32('ms'), '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
    (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
def _determine_schemas_to_compare(schemas, ignore_pandas):
    """
    Iterate over a list of `pyarrow.Schema` objects and prepare them for
    comparison by picking a reference and determining all null columns.

    .. note::

        If pandas metadata exists, the version stored in the metadata is
        overwritten with the currently installed version, since we expect
        to stay backwards compatible.

    Returns
    -------
    reference: Schema
        A reference schema which is picked from the input list. The
        reference schema is guaranteed to be a schema having the least
        number of null columns of all input schemas. The set of null
        columns is guaranteed to be a true subset of all null columns of
        all input schemas. If no such schema can be found, an Exception
        is raised.
    list_of_schemas: List[Tuple[Schema, List]]
        A list holding pairs of (Schema, null_columns) where the
        null_columns are all columns which are null and must be removed
        before comparing the schemas.
    """
    has_pandas = _pandas_in_schemas(schemas) and not ignore_pandas
    schemas_to_evaluate = []
    reference = None
    null_cols_in_reference = set()

    for schema in schemas:
        if not isinstance(schema, SchemaWrapper):
            schema = SchemaWrapper(schema, "__unknown__")

        if has_pandas:
            metadata = schema.metadata
            if metadata is None or b"pandas" not in metadata:
                raise ValueError(
                    "Pandas and non-Pandas schemas are not comparable. "
                    "Use ignore_pandas=True if you only want to compare "
                    "on Arrow level.")
            pandas_metadata = load_json(metadata[b"pandas"].decode("utf8"))

            # We don't care about the pandas version, since we assume it's
            # safe to read datasets that were written by older or newer
            # versions.
            pandas_metadata["pandas_version"] = "{}".format(pd.__version__)

            metadata_clean = deepcopy(metadata)
            metadata_clean[b"pandas"] = _dict_to_binary(pandas_metadata)
            current = SchemaWrapper(pa.schema(schema, metadata_clean),
                                    schema.origin)
        else:
            current = schema

        # If a field is null we cannot compare it and must therefore
        # reject it
        null_columns = {
            field.name for field in current if field.type == pa.null()
        }

        # Determine a valid reference schema. A valid reference schema is
        # the schema among all input schemas with the fewest null columns,
        # such that its set of null columns is a true subset of every
        # other schema's null columns. This ensures that the reference
        # schema carries the most information possible; a schema which
        # doesn't fulfil this requirement would weaken the comparison and
        # would allow for false positives.

        # Trivial case
        if reference is None:
            reference = current
            null_cols_in_reference = null_columns
        # The reference has enough information to validate against the
        # current schema. Append it to the list of schemas to be verified.
        elif null_cols_in_reference.issubset(null_columns):
            schemas_to_evaluate.append((current, null_columns))
        # The current schema includes all information of the reference and
        # more. Add the reference to schemas_to_evaluate and update the
        # reference.
        elif null_columns.issubset(null_cols_in_reference):
            schemas_to_evaluate.append((reference, null_cols_in_reference))
            reference = current
            null_cols_in_reference = null_columns
        # If there is no clear subset available, elect the schema with the
        # fewest null columns as `reference`. Then iterate over the null
        # columns of `reference` and replace each with a non-null field of
        # the `current` schema, which recovers the loop invariant (the
        # null columns of `reference` are a subset of those of `current`).
        else:
            if len(null_columns) < len(null_cols_in_reference):
                reference, current = current, reference
                null_cols_in_reference, null_columns = (
                    null_columns,
                    null_cols_in_reference,
                )

            for col in null_cols_in_reference - null_columns:
                # Enrich the information in the reference by grabbing the
                # missing fields from the current iteration. This assumes
                # that we only check for global validity, so it isn't
                # relevant where the reference comes from.
                reference = _swap_fields_by_name(reference, current, col)
                null_cols_in_reference.remove(col)
            schemas_to_evaluate.append((current, null_columns))

    assert (reference is not None) or (not schemas_to_evaluate)

    return reference, schemas_to_evaluate
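# For context, a hypothetical usage sketch of the helper above, using
# plain pyarrow schemas and ignore_pandas=True (the wrapper types are
# kartothek internals; the expected outcome follows the docstring):
import pyarrow as pa

s1 = pa.schema([("a", pa.int64()), ("b", pa.null())])    # 'b' is all-null
s2 = pa.schema([("a", pa.int64()), ("b", pa.string())])  # fully typed

reference, schemas_to_evaluate = _determine_schemas_to_compare(
    [s1, s2], ignore_pandas=True
)
# `reference` wraps s2 (fewest null columns); s1 is queued for comparison
# with its null column 'b' recorded for removal.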
def test_empty_list(self):
    arr = pyarrow.from_pylist([])
    assert len(arr) == 0
    assert arr.null_count == 0
    assert arr.type == pyarrow.null()
def test_topk_uniques_combiner_with_categorical_feature(self):
    # fa: 4 12, 2 23, 2 34, 2 45
    batches = [
        pa.RecordBatch.from_arrays(
            [pa.array([[12, 23, 34, 12], [45, 23]])], ['fa']),
        pa.RecordBatch.from_arrays(
            [pa.array([[12, 12, 34, 45]])], ['fa']),
        pa.RecordBatch.from_arrays(
            [pa.array([None, None, None, None], type=pa.null())], ['fa']),
    ]
    expected_result = {
        types.FeaturePath(['fa']):
            text_format.Parse(
                """
                path {
                  step: 'fa'
                }
                type: INT
                string_stats {
                  unique: 4
                  top_values {
                    value: '12'
                    frequency: 4
                  }
                  top_values {
                    value: '45'
                    frequency: 2
                  }
                  top_values {
                    value: '34'
                    frequency: 2
                  }
                  top_values {
                    value: '23'
                    frequency: 2
                  }
                  rank_histogram {
                    buckets {
                      low_rank: 0
                      high_rank: 0
                      label: "12"
                      sample_count: 4.0
                    }
                    buckets {
                      low_rank: 1
                      high_rank: 1
                      label: "45"
                      sample_count: 2.0
                    }
                    buckets {
                      low_rank: 2
                      high_rank: 2
                      label: "34"
                      sample_count: 2.0
                    }
                  }
                }""", statistics_pb2.FeatureNameStatistics())
    }
    schema = text_format.Parse(
        """
        feature {
          name: "fa"
          type: INT
          int_domain {
            is_categorical: true
          }
        }
        """, schema_pb2.Schema())
    generator = (
        top_k_uniques_combiner_stats_generator
        .TopKUniquesCombinerStatsGenerator(
            schema=schema,
            num_top_values=4,
            num_rank_histogram_buckets=3))
    self.assertCombinerOutputEqual(batches, generator, expected_result)
    dict(
        testcase_name="without_schema_first_example_typed",
        schema_text_proto=None,
        sequence_examples_text_proto=[
            _TYPED_SEQUENCE_EXAMPLE, _UNTYPED_SEQUENCE_EXAMPLE,
            _SOME_FEATURES_TYPED_SEQUENCE_EXAMPLE,
            _EMPTY_VALUES_LIST_SEQUENCE_EXAMPLE
        ],
        create_expected=lambda list_factory, binary_type:
        pa.RecordBatch.from_arrays([
            pa.array([[1], None, None, []],
                     type=list_factory(pa.int64())),
            pa.array([[1.0, 2.0], None, None, []],
                     type=list_factory(pa.float32())),
            pa.array([[b"a", b"b", b"c"], None, None, []],
                     type=list_factory(binary_type)),
            pa.array([None, None, None, None], pa.null()),
            pa.array([None, None, [1.0], None],
                     type=list_factory(pa.float32())),
            pa.StructArray.from_arrays(
                [
                    pa.array([None, None, [[1.0]], None],
                             type=list_factory(list_factory(pa.float32()))),
                    pa.array([[[1, 2], [3]], [], [None, None, None],
                              [[], []]],
                             type=list_factory(list_factory(pa.int64()))),
                    pa.array([[[3.0, 4.0], [1.0, 2.0]], [], [None], [[]]],
                             type=list_factory(list_factory(pa.float32()))),
                    pa.array([[[b"a", b"b"], [b"c"]], [], [None], [[]]],
                             type=list_factory(list_factory(binary_type)))
                ],
                names=["sequence_v", "sequence_x", "sequence_y",
                       "sequence_z"])
        ], [
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import pickle

import pytest

import pyarrow as pa
import pyarrow.types as types


MANY_TYPES = [
    pa.null(),
    pa.bool_(),
    pa.int32(),
    pa.time32('s'),
    pa.time64('us'),
    pa.date32(),
    pa.timestamp('us'),
    pa.timestamp('us', tz='UTC'),
    pa.timestamp('us', tz='Europe/Paris'),
    pa.float16(),
    pa.float32(),
    pa.float64(),
    pa.decimal128(19, 4),
    pa.string(),
    pa.binary(),
    pa.binary(10),
    ([], None),
    ([[1, 2], [3]], pa.list_(pa.int64())),
    ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
    ([(1, 'a'), (2, 'c'), None],
     pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))])
def test_array_pickle(data, typ):
    # Allocate here so that we don't have any Arrow data allocated.
    # This is needed to ensure that allocator tests can be reliable.
    array = pa.array(data, type=typ)
    result = pickle.loads(pickle.dumps(array))
    assert array.equals(result)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [(pa.null(), 'empty'),
     (pa.bool_(), 'bool'),
     (pa.int8(), 'int8'),
     (pa.int16(), 'int16'),
     (pa.int32(), 'int32'),
     (pa.int64(), 'int64'),
     (pa.uint8(), 'uint8'),
     (pa.uint16(), 'uint16'),
     (pa.uint32(), 'uint32'),
     (pa.uint64(), 'uint64'),
     (pa.float16(), 'float16'),
     (pa.float32(), 'float32'),
     (pa.float64(), 'float64'),
     (pa.date32(), 'date'),
     (pa.date64(), 'date'),
     (pa.binary(), 'bytes'),
     (pa.binary(length=4), 'bytes'),
     (pa.string(), 'unicode'),
     (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
     (pa.decimal128(18, 3), 'decimal'),
     (pa.timestamp('ms'), 'datetime'),
     (pa.timestamp('us', 'UTC'), 'datetimetz'),
     (pa.time32('s'), 'time'),
     (pa.time64('us'), 'time')])
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
    old_allocation = pa.total_allocated_bytes()
    try:
        yield
    finally:
        assert pa.total_allocated_bytes() == old_allocation


@pytest.fixture(autouse=True)
def assert_pyarrow_leak():
    # automatically applied to all test cases
    with no_pyarrow_leak():
        yield


_supported_pyarrow_types = [
    pa.null(),
    pa.bool_(),
    pa.int32(),
    pa.time32("s"),
    pa.time64("us"),
    pa.date32(),
    pa.timestamp("us"),
    pa.timestamp("us", tz="UTC"),
    pa.timestamp("us", tz="Europe/Paris"),
    pa.float16(),
    pa.float32(),
    pa.float64(),
    pa.decimal128(19, 4),
    pa.string(),
    pa.binary(),
    pa.large_string(),
def test_empty_list(self):
    arr = pyarrow.from_pylist([])
    assert len(arr) == 0
    assert arr.null_count == 0
    assert arr.type == pyarrow.null()
    assert arr.to_pylist() == []
import hypothesis.extra.numpy as npst
import hypothesis.extra.pytz as tzst
import numpy as np
import pyarrow as pa


# TODO(kszucs): alphanum_text, surrogate_text
custom_text = st.text(
    alphabet=st.characters(
        min_codepoint=0x41,
        max_codepoint=0x7E
    )
)

null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())
large_binary_type = st.just(pa.large_binary())
large_string_type = st.just(pa.large_string())

signed_integer_types = st.sampled_from([
    pa.int8(),
    pa.int16(),
    pa.int32(),
    pa.int64()
])
unsigned_integer_types = st.sampled_from([
    pa.uint8(),
def test_all_none(self):
    arr = pyarrow.from_pylist([None, None])
    assert len(arr) == 2
    assert arr.null_count == 2
    assert arr.type == pyarrow.null()