def test_fill_null():
    arr = pa.array([1, 2, None, 4], type=pa.int8())
    fill_value = pa.array([5], type=pa.int8())
    with pytest.raises(pa.ArrowInvalid, match="tried to convert to int"):
        arr.fill_null(fill_value)

    arr = pa.array([None, None, None, None], type=pa.null())
    fill_value = pa.scalar(None, type=pa.null())
    result = arr.fill_null(fill_value)
    expected = pa.array([None, None, None, None])
    assert result.equals(expected)

    arr = pa.array(['a', 'bb', None])
    result = arr.fill_null('ccc')
    expected = pa.array(['a', 'bb', 'ccc'])
    assert result.equals(expected)

    arr = pa.array([b'a', b'bb', None], type=pa.large_binary())
    result = arr.fill_null('ccc')
    expected = pa.array([b'a', b'bb', b'ccc'], type=pa.large_binary())
    assert result.equals(expected)

    arr = pa.array(['a', 'bb', None])
    result = arr.fill_null(None)
    expected = pa.array(['a', 'bb', None])
    assert result.equals(expected)
def _ValidateRecordBatch(self, record_batch, raw_record_column_name=None):
  self.assertIsInstance(record_batch, pa.RecordBatch)
  self.assertEqual(record_batch.num_rows, 2)
  expected_schema = _EXPECTED_ARROW_SCHEMA
  if raw_record_column_name is not None:
    expected_schema = pa.schema(
        list(expected_schema) +
        [pa.field(raw_record_column_name, pa.large_list(pa.large_binary()))])
  self.assertTrue(
      record_batch.schema.equals(expected_schema),
      "Expected: {} ; got {}".format(expected_schema, record_batch.schema))
  for i, field in enumerate(record_batch.schema):
    if field.name == raw_record_column_name:
      continue
    self.assertTrue(
        record_batch.column(i).equals(_EXPECTED_COLUMN_VALUES[field.name]),
        "Column {} did not match ({} vs {}).".format(
            field.name, record_batch.column(i),
            _EXPECTED_COLUMN_VALUES[field.name]))
  if raw_record_column_name is not None:
    self.assertEqual(record_batch.schema.names[-1], raw_record_column_name)
    self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                     _RAW_RECORDS)
def testConvertToRecordBatchPassthroughData(self):
  passthrough_key1 = '__passthrough_with_batch_length__'
  passthrough_key2 = '__passthrough_with_one_value__'
  passthrough_key3 = '__passthrough_with_one_distinct_value_none__'
  passthrough_key4 = '__passthrough_with_one_distinct_value_not_none__'
  batch_dict = {
      'a': np.array([100, 1, 10], np.int64),
      passthrough_key1: pa.array([[1], None, [0]], pa.large_list(pa.int64())),
      passthrough_key2: pa.array([None], pa.large_list(pa.float32())),
      passthrough_key3: pa.array([None, None],
                                 pa.large_list(pa.large_binary())),
      passthrough_key4: pa.array([[10], [10]], pa.large_list(pa.int64()))
  }
  schema = schema_utils.schema_from_feature_spec(
      {'a': tf.io.FixedLenFeature([], tf.int64)})
  converter = impl_helper.make_tensor_to_arrow_converter(schema)
  passthrough_keys = {
      passthrough_key1, passthrough_key2, passthrough_key3, passthrough_key4
  }
  arrow_schema = pa.schema([
      ('a', pa.large_list(pa.int64())),
      (passthrough_key1, batch_dict[passthrough_key1].type),
      (passthrough_key2, batch_dict[passthrough_key2].type),
      (passthrough_key3, batch_dict[passthrough_key3].type),
      (passthrough_key4, batch_dict[passthrough_key4].type)
  ])
  # Note that we only need `input_metadata.arrow_schema`.
  input_metadata = TensorAdapterConfig(arrow_schema, {})
  record_batch, unary_features = impl._convert_to_record_batch(
      batch_dict, schema, converter, passthrough_keys, input_metadata)
  expected_record_batch = {
      'a': [[100], [1], [10]],
      passthrough_key1: [[1], None, [0]]
  }
  self.assertDictEqual(expected_record_batch, record_batch.to_pydict())
  expected_unary_features = {
      passthrough_key2: [None],
      passthrough_key3: [None],
      passthrough_key4: [[10]]
  }
  unary_features = {k: v.to_pylist() for k, v in unary_features.items()}
  self.assertDictEqual(expected_unary_features, unary_features)

  # Test pass-through data when the input and output batch sizes differ and
  # the pass-through column has more than one distinct value.
  passthrough_key5 = '__passthrough_with_wrong_batch_size__'
  passthrough_keys.add(passthrough_key5)
  batch_dict[passthrough_key5] = pa.array([[1], [2]],
                                          pa.large_list(pa.int64()))
  input_metadata.arrow_schema = input_metadata.arrow_schema.append(
      pa.field(passthrough_key5, batch_dict[passthrough_key5].type))
  with self.assertRaisesRegexp(
      ValueError, 'Cannot pass-through data when '
      'input and output batch sizes are different'):
    _ = impl._convert_to_record_batch(batch_dict, schema, converter,
                                      passthrough_keys, input_metadata)
def _make_record_batch(num_cols, num_rows):
  columns = [
      pa.array([[b"kk"]] * num_rows, type=pa.large_list(pa.large_binary()))
      for _ in range(num_cols)
  ]
  column_names = ["col%d" % c for c in range(num_cols)]
  return pa.record_batch(columns, column_names)
def _ValidateRecordBatch(self, record_batch, raw_record_column_name=None):
  self.assertIsInstance(record_batch, pa.RecordBatch)
  self.assertEqual(record_batch.num_rows, 3)
  for i, field in enumerate(record_batch.schema):
    if field.name == raw_record_column_name:
      continue
    if field.name == _SEQUENCE_COLUMN_NAME:
      self.assertTrue(pa.types.is_struct(field.type))
      for seq_column, seq_field in zip(
          record_batch.column(i).flatten(), list(field.type)):
        expected_array = _EXPECTED_COLUMN_VALUES[path.ColumnPath(
            [_SEQUENCE_COLUMN_NAME, seq_field.name])]
        self.assertTrue(
            seq_column.equals(expected_array),
            "Sequence column {} did not match ({} vs {})".format(
                seq_field.name, seq_column, expected_array))
      continue
    self.assertTrue(
        record_batch.column(i).equals(
            _EXPECTED_COLUMN_VALUES[path.ColumnPath([field.name])]),
        "Column {} did not match ({} vs {}).".format(
            field.name, record_batch.column(i),
            _EXPECTED_COLUMN_VALUES[path.ColumnPath([field.name])]))
  if raw_record_column_name is not None:
    self.assertEqual(record_batch.schema.names[-1], raw_record_column_name)
    self.assertTrue(record_batch.columns[-1].type.equals(
        pa.large_list(pa.large_binary())))
    self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                     _SERIALIZED_EXAMPLES)
def _LargeBinaryCanBeDictEncoded() -> bool:
  """Returns True if a large binary array can be dictionary encoded."""
  try:
    pa.array([], type=pa.large_binary()).dictionary_encode()
  except:  # pylint:disable=bare-except
    return False
  return True
def _get_binary_like_byte_size_test_cases():
  result = []
  for array_type, sizeof_offsets in [
      (pa.binary(), 4),
      (pa.string(), 4),
      (pa.large_binary(), 8),
      (pa.large_string(), 8),
  ]:
    result.append(
        dict(
            testcase_name=str(array_type),
            array=pa.array([
                "a", "bb", "ccc", "dddd", "eeeee", "ffffff", "ggggggg",
                "hhhhhhhh", "iiiiiiiii"
            ], type=array_type),
            slice_offset=1,
            slice_length=3,
            # contents: 45
            # offsets: 10 * sizeof_offsets
            # null bitmap: 2
            expected_size=(45 + sizeof_offsets * 10 + 2),
            # contents: 9
            # offsets: 4 * sizeof_offsets
            # null bitmap: 1
            expected_sliced_size=(9 + sizeof_offsets * 4 + 1)))
  return result
def get_many_types():
    # Returning these from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_toal_bytes_allocated
    # checks that the default memory pool has zero allocated bytes.
    return (pa.null(),
            pa.bool_(),
            pa.int32(),
            pa.time32('s'),
            pa.time64('us'),
            pa.date32(),
            pa.timestamp('us'),
            pa.timestamp('us', tz='UTC'),
            pa.timestamp('us', tz='Europe/Paris'),
            pa.float16(),
            pa.float32(),
            pa.float64(),
            pa.decimal128(19, 4),
            pa.string(),
            pa.binary(),
            pa.binary(10),
            pa.large_string(),
            pa.large_binary(),
            pa.list_(pa.int32()),
            pa.large_list(pa.uint16()),
            pa.struct([pa.field('a', pa.int32()),
                       pa.field('b', pa.int8()),
                       pa.field('c', pa.string())]),
            pa.struct([pa.field('a', pa.int32(), nullable=False),
                       pa.field('b', pa.int8(), nullable=False),
                       pa.field('c', pa.string())]),
            pa.union([pa.field('a', pa.binary(10)),
                      pa.field('b', pa.string())],
                     mode=pa.lib.UnionMode_DENSE),
            pa.union([pa.field('a', pa.binary(10)),
                      pa.field('b', pa.string())],
                     mode=pa.lib.UnionMode_SPARSE),
            pa.union([pa.field('a', pa.binary(10), nullable=False),
                      pa.field('b', pa.string())],
                     mode=pa.lib.UnionMode_SPARSE),
            pa.dictionary(pa.int32(), pa.string()))
def test_large_binary():
    data = [b'foo', b'bar'] * 50
    for type in [pa.large_binary(), pa.large_string()]:
        arr = pa.array(data, type=type)
        table = pa.Table.from_arrays([arr], names=['strs'])
        for use_dictionary in [False, True]:
            _check_roundtrip(table, use_dictionary=use_dictionary)
def test_decode_large_types(self, schema_text_proto, examples_text_proto,
                            create_expected):
  serialized_examples = [
      text_format.Parse(pbtxt, tf.train.Example()).SerializeToString()
      for pbtxt in examples_text_proto
  ]
  serialized_schema = None
  if schema_text_proto is not None:
    serialized_schema = text_format.Parse(
        schema_text_proto, schema_pb2.Schema()).SerializeToString()
  if serialized_schema:
    coder = example_coder.ExamplesToRecordBatchDecoder(
        serialized_schema=serialized_schema, use_large_types=True)
  else:
    coder = example_coder.ExamplesToRecordBatchDecoder(use_large_types=True)
  result = coder.DecodeBatch(serialized_examples)
  self.assertIsInstance(result, pa.RecordBatch)
  expected = create_expected(pa.large_list, pa.large_binary())
  self.assertTrue(
      result.equals(expected),
      "actual: {}\n expected:{}".format(result, expected))
  if serialized_schema:
    self.assertTrue(expected.schema.equals(coder.ArrowSchema()))
def _LargeBinaryCanBeValueCounted() -> bool:
  """Returns True if a large binary array can be value counted."""
  try:
    array_util.ValueCounts(pa.array([], type=pa.large_binary()))
  except:  # pylint:disable=bare-except
    return False
  return True
def test_type_ids():
    # Having this fixed is very important because internally we rely on this id
    # to parse from python
    for idx, arrow_type in [
        (0, pa.null()),
        (1, pa.bool_()),
        (2, pa.uint8()),
        (3, pa.int8()),
        (4, pa.uint16()),
        (5, pa.int16()),
        (6, pa.uint32()),
        (7, pa.int32()),
        (8, pa.uint64()),
        (9, pa.int64()),
        (10, pa.float16()),
        (11, pa.float32()),
        (12, pa.float64()),
        (13, pa.string()),
        (13, pa.utf8()),
        (14, pa.binary()),
        (16, pa.date32()),
        (17, pa.date64()),
        (18, pa.timestamp("us")),
        (19, pa.time32("s")),
        (20, pa.time64("us")),
        (23, pa.decimal128(8, 1)),
        (34, pa.large_utf8()),
        (35, pa.large_binary()),
    ]:
        assert idx == arrow_type.id
def testIsBinaryLike(self):
  for t in (pa.binary(), pa.large_binary(), pa.string(), pa.large_string()):
    self.assertTrue(arrow_util.is_binary_like(t))
  for t in (pa.list_(pa.binary()), pa.large_list(pa.string())):
    self.assertFalse(arrow_util.is_binary_like(t))
def _test_decode(self, schema_text_proto, sequence_examples_text_proto,
                 create_expected, use_large_types):
  serialized_sequence_examples = [
      text_format.Parse(pbtxt, tf.train.SequenceExample()).SerializeToString()
      for pbtxt in sequence_examples_text_proto
  ]
  serialized_schema = None
  if schema_text_proto is not None:
    serialized_schema = text_format.Parse(
        schema_text_proto, schema_pb2.Schema()).SerializeToString()
  if serialized_schema:
    coder = sequence_example_coder.SequenceExamplesToRecordBatchDecoder(
        _TEST_SEQUENCE_COLUMN_NAME,
        serialized_schema,
        use_large_types=use_large_types)
  else:
    coder = sequence_example_coder.SequenceExamplesToRecordBatchDecoder(
        _TEST_SEQUENCE_COLUMN_NAME, use_large_types=use_large_types)
  result = coder.DecodeBatch(serialized_sequence_examples)
  self.assertIsInstance(result, pa.RecordBatch)
  if use_large_types:
    expected = create_expected(pa.large_list, pa.large_binary())
  else:
    expected = create_expected(pa.list_, pa.binary())
  self.assertTrue(
      result.equals(expected),
      "actual: {}\n expected:{}".format(result, expected))
  if serialized_schema is not None:
    self.assertTrue(coder.ArrowSchema().equals(result.schema))
def _ValidateRecordBatch(self, tfxio, record_batch,
                         raw_record_column_name=None):
  self.assertIsInstance(record_batch, pa.RecordBatch)
  self.assertEqual(record_batch.num_rows, 3)
  expected_column_values = GetExpectedColumnValues(tfxio)
  for i, field in enumerate(record_batch.schema):
    if field.name == raw_record_column_name:
      continue
    self.assertTrue(
        record_batch.column(i).equals(expected_column_values[field.name]),
        "Column {} did not match ({} vs {}).".format(
            field.name, record_batch.column(i),
            expected_column_values[field.name]))
  if raw_record_column_name is not None:
    if tfxio._can_produce_large_types:
      raw_record_column_type = pa.large_list(pa.large_binary())
    else:
      raw_record_column_type = pa.list_(pa.binary())
    self.assertEqual(record_batch.schema.names[-1], raw_record_column_name)
    self.assertTrue(
        record_batch.columns[-1].type.equals(raw_record_column_type))
    self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                     _SERIALIZED_EXAMPLES)
def test_large_binary(self):
    array = pyarrow.array(
        [b"1111", b"2222", b"3333"],
        pyarrow.large_binary(),
        numpy.array([False, True, False]),
    )
    self._test_data(array)
def testRecordBatchAndTensorAdapter(self):
  column_name = "raw_record"
  telemetry_descriptors = ["some", "component"]
  tfxio = raw_tf_record.RawTfRecordTFXIO(
      self._raw_record_file,
      column_name,
      telemetry_descriptors=telemetry_descriptors)
  expected_type = (
      pa.large_list(pa.large_binary())
      if _ProducesLargeTypes(tfxio) else pa.list_(pa.binary()))
  got_schema = tfxio.ArrowSchema()
  self.assertTrue(
      got_schema.equals(pa.schema([pa.field(column_name, expected_type)])),
      "got: {}".format(got_schema))

  def _AssertFn(record_batches):
    self.assertLen(record_batches, 1)
    record_batch = record_batches[0]
    self.assertTrue(record_batch.schema.equals(tfxio.ArrowSchema()))
    self.assertTrue(record_batch.columns[0].equals(
        pa.array([[r] for r in _RAW_RECORDS], type=expected_type)))
    tensor_adapter = tfxio.TensorAdapter()
    tensors = tensor_adapter.ToBatchTensors(record_batch)
    self.assertLen(tensors, 1)
    self.assertIn(column_name, tensors)

  p = beam.Pipeline()
  record_batch_pcoll = p | tfxio.BeamSource(batch_size=len(_RAW_RECORDS))
  beam_testing_util.assert_that(record_batch_pcoll, _AssertFn)
  pipeline_result = p.run()
  pipeline_result.wait_until_finish()
  telemetry_test_util.ValidateMetrics(self, pipeline_result,
                                      telemetry_descriptors, "bytes",
                                      "tfrecords_gzip")
def _tf_dtype_to_arrow_type(dtype: tf.DType):
  """Maps a tf Dtype to an Arrow type."""
  if dtype == tf.string:
    return pa.large_binary()
  elif dtype == tf.bool:
    raise TypeError("Unable to handle bool tensors -- consider casting it to a "
                    "tf.uint8")
  return pa.from_numpy_dtype(dtype.as_numpy_dtype)
def _GetConvertToBinaryFn(
    array_type: pa.DataType) -> Optional[Callable[[pa.Array], pa.Array]]:
  """Returns a function that converts a StringArray to BinaryArray."""
  if pa.types.is_string(array_type):
    return lambda array: array.view(pa.binary())
  if pa.types.is_large_string(array_type):
    return lambda array: array.view(pa.large_binary())
  return None
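A minimal usage sketch for _GetConvertToBinaryFn above (the input arrays here are invented for illustration): string-typed arrays get a zero-copy view as the matching binary type, while non-string types yield no converter.

# Hedged example, not part of the original module.
convert_fn = _GetConvertToBinaryFn(pa.large_string())
assert convert_fn is not None
converted = convert_fn(pa.array(["abc", "de"], type=pa.large_string()))
assert converted.type == pa.large_binary()
assert _GetConvertToBinaryFn(pa.int64()) is None  # non-string types get no converter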
def CreateRawRecordColumn(raw_records: List[bytes],
                          produce_large_types: bool) -> pa.Array:
  """Returns an Array that satisfies the requirement of a raw record column."""
  list_array_factory = (
      pa.LargeListArray.from_arrays
      if produce_large_types else pa.ListArray.from_arrays)
  binary_type = pa.large_binary() if produce_large_types else pa.binary()
  return list_array_factory(
      np.arange(0, len(raw_records) + 1, dtype=np.int64),
      pa.array(raw_records, type=binary_type))
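A small usage sketch for CreateRawRecordColumn above (the record values are made up): each raw record is wrapped in a singleton list so the column aligns row-for-row with the decoded columns of the RecordBatch.

# Hedged example, not part of the original module.
raw_column = CreateRawRecordColumn([b"rec0", b"rec1", b"rec2"],
                                   produce_large_types=True)
assert raw_column.type == pa.large_list(pa.large_binary())
assert raw_column.to_pylist() == [[b"rec0"], [b"rec1"], [b"rec2"]]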
def test_large_binary_huge():
    s = b'xy' * 997
    data = [s] * ((1 << 33) // len(s))
    for type in [pa.large_binary(), pa.large_string()]:
        arr = pa.array(data, type=type)
        table = pa.Table.from_arrays([arr], names=['strs'])
        for use_dictionary in [False, True]:
            _check_roundtrip(table, use_dictionary=use_dictionary)
        del arr, table
def test_query_with_all_supported_types(self):
  record_batch = pa.RecordBatch.from_arrays([
      pa.array([[1], [2]], type=pa.list_(pa.int32())),
      pa.array([[10], [20]], type=pa.list_(pa.int64())),
      pa.array([[1.1], [2.2]], type=pa.list_(pa.float32())),
      pa.array([[10.1], [20.2]], type=pa.list_(pa.float64())),
      pa.array([['a'], ['b']], type=pa.list_(pa.string())),
      pa.array([['a+'], ['b+']], type=pa.list_(pa.large_string())),
      pa.array([[b'a_bytes'], [b'b_bytes']], type=pa.list_(pa.binary())),
      pa.array([[b'a_bytes+'], [b'b_bytes+']],
               type=pa.list_(pa.large_binary())),
  ], [
      'int32_list',
      'int64_list',
      'float32_list',
      'float64_list',
      'string_list',
      'large_string_list',
      'binary_list',
      'large_binary_list',
  ])
  sql = """
    SELECT
      ARRAY(
        SELECT
          STRUCT(int32_list, int64_list, float32_list, float64_list,
                 string_list, large_string_list, binary_list,
                 large_binary_list)
        FROM
          example.int32_list,
          example.int64_list,
          example.float32_list,
          example.float64_list,
          example.string_list,
          example.large_string_list,
          example.binary_list,
          example.large_binary_list
      ) as slice_key
    FROM Examples as example;"""
  query = sql_util.RecordBatchSQLSliceQuery(sql, record_batch.schema)
  slices = query.Execute(record_batch)
  self.assertEqual(slices,
                   [[[('int32_list', '1'), ('int64_list', '10'),
                      ('float32_list', '1.1'), ('float64_list', '10.1'),
                      ('string_list', 'a'), ('large_string_list', 'a+'),
                      ('binary_list', 'a_bytes'),
                      ('large_binary_list', 'a_bytes+')]],
                    [[('int32_list', '2'), ('int64_list', '20'),
                      ('float32_list', '2.2'), ('float64_list', '20.2'),
                      ('string_list', 'b'), ('large_string_list', 'b+'),
                      ('binary_list', 'b_bytes'),
                      ('large_binary_list', 'b_bytes+')]]])
def test_simple(self, attach_raw_records):
  raw_record_column_name = "_raw_records" if attach_raw_records else None
  tfxio = record_to_tensor_tfxio.TFRecordToTensorTFXIO(
      self._input_path,
      self._decoder_path,
      _TELEMETRY_DESCRIPTORS,
      raw_record_column_name=raw_record_column_name)
  expected_fields = [
      pa.field("st1", pa.list_(pa.binary())),
      pa.field("st2", pa.list_(pa.binary())),
  ]
  if attach_raw_records:
    raw_record_column_type = (
        pa.large_list(pa.large_binary())
        if tfxio._can_produce_large_types else pa.list_(pa.binary()))
    expected_fields.append(
        pa.field(raw_record_column_name, raw_record_column_type))
  self.assertTrue(tfxio.ArrowSchema().equals(pa.schema(expected_fields)),
                  tfxio.ArrowSchema())
  self.assertEqual(
      tfxio.TensorRepresentations(), {
          "st1":
              text_format.Parse(
                  """varlen_sparse_tensor { column_name: "st1" }""",
                  schema_pb2.TensorRepresentation()),
          "st2":
              text_format.Parse(
                  """varlen_sparse_tensor { column_name: "st2" }""",
                  schema_pb2.TensorRepresentation())
      })

  tensor_adapter = tfxio.TensorAdapter()
  self.assertEqual(tensor_adapter.TypeSpecs(),
                   _DecoderForTesting().output_type_specs())

  def _assert_fn(list_of_rb):
    self.assertLen(list_of_rb, 1)
    rb = list_of_rb[0]
    self.assertTrue(rb.schema.equals(tfxio.ArrowSchema()))
    tensors = tensor_adapter.ToBatchTensors(rb)
    self.assertLen(tensors, 2)
    for tensor_name in ("st1", "st2"):
      self.assertIn(tensor_name, tensors)
      st = tensors[tensor_name]
      self.assertAllEqual(st.values, _RECORDS)
      self.assertAllEqual(st.indices, [[0, 0], [1, 0]])
      self.assertAllEqual(st.dense_shape, [2, 1])

  p = beam.Pipeline()
  rb_pcoll = p | tfxio.BeamSource(batch_size=len(_RECORDS))
  beam_testing_util.assert_that(rb_pcoll, _assert_fn)
  pipeline_result = p.run()
  pipeline_result.wait_until_finish()
  telemetry_test_util.ValidateMetrics(self, pipeline_result,
                                      _TELEMETRY_DESCRIPTORS, "tensor",
                                      "tfrecords_gzip")
def _tf_dtype_to_arrow_type(dtype: tf.DType) -> pa.DataType:
  """Maps a tf data type to a pyarrow data type."""
  if dtype == tf.string:
    return pa.large_binary()
  elif dtype == tf.int64:
    return pa.int64()
  elif dtype == tf.float32:
    return pa.float32()
  else:
    raise TypeError('Unable to handle data type {}'.format(dtype))
def _resize_arrow_type(t):
    if t == pa.string():
        return pa.large_string()
    if t == pa.utf8():
        return pa.large_utf8()
    if t == pa.binary():
        return pa.large_binary()
    if isinstance(t, pa.lib.ListType):
        return pa.large_list(t.value_type)
    return t
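A brief illustration of _resize_arrow_type above (example types chosen arbitrarily): string-like and list types are widened to their 64-bit-offset counterparts, and everything else passes through unchanged.

# Hedged example, not part of the original module.
assert _resize_arrow_type(pa.binary()) == pa.large_binary()
assert _resize_arrow_type(pa.string()) == pa.large_string()
assert _resize_arrow_type(pa.list_(pa.int32())) == pa.large_list(pa.int32())
assert _resize_arrow_type(pa.int64()) == pa.int64()  # unchanged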
def test_large_binary_overflow():
    s = b'x' * (1 << 31)
    arr = pa.array([s], type=pa.large_binary())
    table = pa.Table.from_arrays([arr], names=['strs'])
    for use_dictionary in [False, True]:
        writer = pa.BufferOutputStream()
        with pytest.raises(
                pa.ArrowInvalid,
                match="Parquet cannot store strings with size 2GB or more"):
            _write_table(table, writer, use_dictionary=use_dictionary)
def ArrowSchema(self) -> pa.Schema:
  schema = self._ArrowSchemaNoRawRecordColumn()
  if self._raw_record_column_name is not None:
    if schema.get_field_index(self._raw_record_column_name) != -1:
      raise ValueError(
          "Raw record column name {} collided with a column in the schema."
          .format(self._raw_record_column_name))
    schema = schema.append(
        pa.field(self._raw_record_column_name,
                 pa.large_list(pa.large_binary())))
  return schema
def test_sequence_bytes():
    u1 = b'ma\xc3\xb1ana'
    data = [b'foo',
            u1.decode('utf-8'),  # unicode gets encoded
            bytearray(b'bar'),
            None]
    for ty in [None, pa.binary(), pa.large_binary()]:
        arr = pa.array(data, type=ty)
        assert len(arr) == 4
        assert arr.null_count == 1
        # When no type is requested, plain binary is inferred.
        assert arr.type == (ty or pa.binary())
        assert arr.to_pylist() == [b'foo', u1, b'bar', None]
def _map_arrow_type(arrow_type):
    arrow_to_dh = {
        pa.null(): '',
        pa.bool_(): '',
        pa.int8(): 'byte',
        pa.int16(): 'short',
        pa.int32(): 'int',
        pa.int64(): 'long',
        pa.uint8(): '',
        pa.uint16(): 'char',
        pa.uint32(): '',
        pa.uint64(): '',
        pa.float16(): '',
        pa.float32(): 'float',
        pa.float64(): 'double',
        pa.time32('s'): '',
        pa.time32('ms'): '',
        pa.time64('us'): '',
        pa.time64('ns'): 'io.deephaven.time.DateTime',
        pa.timestamp('us', tz=None): '',
        pa.timestamp('ns', tz=None): '',
        pa.date32(): 'java.time.LocalDate',
        pa.date64(): 'java.time.LocalDate',
        pa.binary(): '',
        pa.string(): 'java.lang.String',
        pa.utf8(): 'java.lang.String',
        pa.large_binary(): '',
        pa.large_string(): '',
        pa.large_utf8(): '',
        # decimal128(int precision, int scale=0)
        # list_(value_type, int list_size=-1)
        # large_list(value_type)
        # map_(key_type, item_type[, keys_sorted])
        # struct(fields)
        # dictionary(index_type, value_type, …)
        # field(name, type, bool nullable = True[, metadata])
        # schema(fields[, metadata])
        # from_numpy_dtype(dtype)
    }

    dh_type = arrow_to_dh.get(arrow_type)
    if not dh_type:
        # if this is a case of timestamp with tz specified
        if isinstance(arrow_type, pa.TimestampType):
            dh_type = "io.deephaven.time.DateTime"

    if not dh_type:
        raise DHError(f'unsupported arrow data type : {arrow_type}')

    return {"deephaven:type": dh_type}
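A short usage sketch for _map_arrow_type above (illustrative only): supported Arrow types map to a Deephaven type annotation, timestamps with a timezone fall back to the DateTime mapping, and types that resolve to an empty string, such as pa.large_binary(), raise DHError.

# Hedged example, not part of the original module.
assert _map_arrow_type(pa.int32()) == {"deephaven:type": "int"}
assert _map_arrow_type(pa.timestamp("ns", tz="UTC")) == {
    "deephaven:type": "io.deephaven.time.DateTime"}
try:
    _map_arrow_type(pa.large_binary())  # maps to '' above, hence unsupported
except DHError:
    pass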
def test__type_to_expression():
    assert "int" == _type_to_expression(pa.int32())
    assert "datetime" == _type_to_expression(TRIAD_DEFAULT_TIMESTAMP)
    assert "timestamp(ns,America/New_York)" == _type_to_expression(
        pa.timestamp("ns", "America/New_York"))
    assert "timestamp(s)" == _type_to_expression(pa.timestamp("s"))
    assert "decimal(5)" == _type_to_expression(pa.decimal128(5))
    assert "decimal(5,2)" == _type_to_expression(pa.decimal128(5, 2))
    assert "bytes" == _type_to_expression(pa.binary())
    assert "bytes" == _type_to_expression(pa.binary(-1))
    raises(NotImplementedError, lambda: _type_to_expression(pa.binary(0)))
    raises(NotImplementedError, lambda: _type_to_expression(pa.binary(-2)))
    raises(NotImplementedError, lambda: _type_to_expression(pa.binary(1)))
    raises(NotImplementedError, lambda: _type_to_expression(pa.large_binary()))