def test_struct_type():
    """Check that ``pa.struct`` keeps its fields for each supported input form.

    The struct type is built from a list of fields, from a list of
    ``(name, type)`` tuples and from an ordered mapping.  In every case,
    iterating the type must yield the expected fields in order.
    """
    # Two of the three fields share the name 'a'.
    fields = [pa.field('a', pa.int64()),
              pa.field('a', pa.int32()),
              pa.field('b', pa.int32())]
    ty = pa.struct(fields)

    assert len(ty) == ty.num_children == 3
    assert list(ty) == fields

    # Element-wise comparison; without ``assert`` the result of ``a == b``
    # would be silently discarded and the loop would check nothing.
    for a, b in zip(ty, fields):
        assert a == b

    # Construct from list of tuples
    ty = pa.struct([('a', pa.int64()),
                    ('a', pa.int32()),
                    ('b', pa.int32())])
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        assert a == b

    # Construct from mapping
    fields = [pa.field('a', pa.int64()),
              pa.field('b', pa.int32())]
    ty = pa.struct(OrderedDict([('a', pa.int64()),
                                ('b', pa.int32())]))
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        assert a == b
def test_orcfile_empty():
    """Read the empty ORC example file and check row count and schema."""
    from pyarrow import orc

    orc_file = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
    empty_table = orc_file.read()
    assert empty_table.num_rows == 0
    actual_schema = empty_table.schema

    # The (int1, string1) struct is used by three of the nested columns.
    inner_struct = pa.struct([
        ('int1', pa.int32()),
        ('string1', pa.string()),
    ])
    inner_list = pa.list_(inner_struct)
    map_entry = pa.struct([
        ('key', pa.string()),
        ('value', inner_struct),
    ])

    columns = [
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([('list', inner_list)])),
        ('list', inner_list),
        ('map', pa.list_(map_entry)),
    ]
    assert actual_schema == pa.schema(columns)
def test_struct_type():
    """Exercise construction, indexing and iteration of ``pa.struct`` types.

    Covers integer and string subscripting (including a duplicated field
    name, a missing name and an invalid key type), the three supported
    constructor inputs (list of fields, list of tuples, ordered mapping)
    and rejection of an invalid field type.
    """
    fields = [
        # Duplicate field name on purpose
        pa.field('a', pa.int64()),
        pa.field('a', pa.int32()),
        pa.field('b', pa.int32())
    ]
    ty = pa.struct(fields)

    assert len(ty) == ty.num_children == 3
    assert list(ty) == fields
    assert ty[0].name == 'a'
    assert ty[2].type == pa.int32()
    with pytest.raises(IndexError):
        assert ty[3]

    assert ty['b'] == ty[2]

    # Duplicate
    with pytest.warns(UserWarning):
        with pytest.raises(KeyError):
            ty['a']

    # Not found
    with pytest.raises(KeyError):
        ty['c']

    # Neither integer nor string
    with pytest.raises(TypeError):
        ty[None]

    # Element-wise comparison; without ``assert`` the result of ``a == b``
    # would be silently discarded and the loop would check nothing.
    for a, b in zip(ty, fields):
        assert a == b

    # Construct from list of tuples
    ty = pa.struct([('a', pa.int64()),
                    ('a', pa.int32()),
                    ('b', pa.int32())])
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        assert a == b

    # Construct from mapping
    fields = [pa.field('a', pa.int64()),
              pa.field('b', pa.int32())]
    ty = pa.struct(OrderedDict([('a', pa.int64()),
                                ('b', pa.int32())]))
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        assert a == b

    # Invalid args
    with pytest.raises(TypeError):
        pa.struct([('a', None)])
def test_cast_from_null():
    """Cast an all-null array to a range of target types.

    The first group of targets is checked through ``_check_cast_case``;
    the second group is expected to raise ``NotImplementedError``.
    """
    null_values = [None] * 3
    null_type = pa.null()

    struct_target = pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.list_(pa.int8())),
        pa.field('c', pa.string()),
    ])
    castable_targets = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        struct_target,
    ]
    for target in castable_targets:
        # Input and expected output are the same list of nulls.
        _check_cast_case((null_values, null_type, null_values, target))

    def make_union(mode):
        members = [pa.field('a', pa.binary(10)),
                   pa.field('b', pa.string())]
        return pa.union(members, mode=mode)

    rejected_targets = [
        pa.dictionary(pa.int32(), pa.string()),
        make_union(pa.lib.UnionMode_DENSE),
        make_union(pa.lib.UnionMode_SPARSE),
    ]
    null_arr = pa.array(null_values, type=pa.null())
    for target in rejected_targets:
        with pytest.raises(NotImplementedError):
            null_arr.cast(target)
def test_struct_array_field():
    """Access struct array children by position, negative position and name."""
    struct_type = pa.struct([pa.field('x', pa.int16()),
                             pa.field('y', pa.float32())])
    arr = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=struct_type)

    x_pos, y_pos = arr.field(0), arr.field(1)
    x_neg, y_neg = arr.field(-2), arr.field(-1)
    x_name, y_name = arr.field('x'), arr.field('y')

    assert isinstance(x_pos, pa.lib.Int16Array)
    assert isinstance(y_neg, pa.lib.FloatArray)
    assert x_pos.equals(pa.array([1, 3, 5], type=pa.int16()))
    assert y_pos.equals(pa.array([2.5, 4.5, 6.5], type=pa.float32()))

    # Every way of addressing a child must give the same array.
    assert x_pos.equals(x_neg)
    assert x_pos.equals(x_name)
    assert y_pos.equals(y_neg)
    assert y_pos.equals(y_name)

    # Each group of bad selectors is expected to raise its own exception type.
    rejected = [
        (TypeError, [None, pa.int16()]),
        (IndexError, [3, -3]),
        (KeyError, ['z', '']),
    ]
    for expected_error, selectors in rejected:
        for selector in selectors:
            with pytest.raises(expected_error):
                arr.field(selector)
def test_struct_array_slice():
    """Slice a struct array and convert the slice to Python objects."""
    # ARROW-2311: slicing nested arrays needs special care
    struct_type = pa.struct([pa.field('a', pa.int8()),
                             pa.field('b', pa.float32())])
    rows = [(1, 2.5), (3, 4.5), (5, 6.5)]
    tail = pa.array(rows, type=struct_type)[1:]
    expected_tail = [{'a': 3, 'b': 4.5},
                     {'a': 5, 'b': 6.5}]
    assert tail.to_pylist() == expected_tail
def test_buffers_nested():
    """Inspect the raw buffers returned for a list array and a struct array."""

    def first_byte(buf):
        # Only the first byte of each bitmap is compared.
        return bytearray(buf.to_pybytes())[0]

    # --- list<int64> ---
    list_arr = pa.array([[1, 2], None, [3, None, 4, 5]],
                        type=pa.list_(pa.int64()))
    list_bufs = list_arr.buffers()
    assert len(list_bufs) == 4
    validity, offsets, child_validity, child_values = list_bufs

    # The parent buffers
    assert first_byte(validity) == 0b00000101
    assert struct.unpack('4i', offsets.to_pybytes()) == (0, 2, 2, 6)
    # The child buffers
    assert first_byte(child_validity) == 0b00110111
    unpacked = struct.unpack('qqq8xqq', child_values.to_pybytes())
    assert unpacked == (1, 2, 3, 4, 5)

    # --- struct<a: int8, b: int16> ---
    struct_type = pa.struct([pa.field('a', pa.int8()),
                             pa.field('b', pa.int16())])
    struct_arr = pa.array([(42, None), None, (None, 43)], type=struct_type)
    struct_bufs = struct_arr.buffers()
    assert len(struct_bufs) == 5
    validity, a_validity, a_values, b_validity, b_values = struct_bufs

    # The parent buffer
    assert first_byte(validity) == 0b00000101
    # The child buffers: 'a'
    assert first_byte(a_validity) == 0b00000001
    assert struct.unpack('bxx', a_values.to_pybytes()) == (42,)
    # The child buffers: 'b'
    assert first_byte(b_validity) == 0b00000100
    assert struct.unpack('4xh', b_values.to_pybytes()) == (43,)
def test_type_schema_pickling():
    """Round-trip data types, a field and a schema through pickle."""
    struct_case = pa.struct([
        pa.field('a', 'int8'),
        pa.field('b', 'string')
    ])
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        struct_case,
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for original in cases:
        restored = pickle.loads(pickle.dumps(original))
        assert original == restored

    # Build one schema out of all cases: fields are used as they are,
    # bare types are wrapped in a field named after their position.
    schema_fields = [
        case if isinstance(case, pa.Field)
        else pa.field('_f{}'.format(position), case)
        for position, case in enumerate(cases)
    ]
    schema = pa.schema(schema_fields, metadata={b'foo': b'bar'})
    restored_schema = pickle.loads(pickle.dumps(schema))
    assert schema == restored_schema
def test_struct_from_tuples():
    """Build struct arrays from tuples, given as a list or an object ndarray."""
    struct_type = pa.struct([pa.field('a', pa.int32()),
                             pa.field('b', pa.string()),
                             pa.field('c', pa.bool_())])

    rows = [(5, 'foo', True),
            (6, 'bar', False)]
    expected_rows = [{'a': 5, 'b': 'foo', 'c': True},
                     {'a': 6, 'b': 'bar', 'c': False}]
    from_list = pa.array(rows, type=struct_type)

    # Same rows, held in a 1-d object ndarray.
    rows_ndarray = np.empty(len(rows), dtype=object)
    rows_ndarray[:] = rows
    from_ndarray = pa.array(rows_ndarray, type=struct_type)

    assert from_list.to_pylist() == expected_rows
    assert from_list.equals(from_ndarray)

    # With omitted values
    rows = [(5, 'foo', None),
            None,
            (6, None, False)]
    expected_rows = [{'a': 5, 'b': 'foo', 'c': None},
                     None,
                     {'a': 6, 'b': None, 'c': False}]
    assert pa.array(rows, type=struct_type).to_pylist() == expected_rows

    # Invalid tuple size
    wrong_sizes = [(5, 'foo'), (), ('5', 'foo', True, None)]
    for bad_tuple in wrong_sizes:
        with pytest.raises(ValueError, match="(?i)tuple size"):
            pa.array([bad_tuple], type=struct_type)
def test_struct_from_dicts_inference():
    """Infer struct types from sequences of dicts, without an explicit type.

    Covers fully populated dicts, dicts with omitted keys / ``None`` rows
    (both as a list and as an object ndarray), nested structs, the empty
    dict, and rejection of a mix of scalars and dicts.
    """
    expected_type = pa.struct([pa.field('a', pa.int64()),
                               pa.field('b', pa.string()),
                               pa.field('c', pa.bool_())])
    data = [{'a': 5, 'b': u'foo', 'c': True},
            {'a': 6, 'b': u'bar', 'c': False}]
    arr = pa.array(data)
    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == data

    # With omitted values
    data = [{'a': 5, 'c': True},
            None,
            {},
            {'a': None, 'b': u'bar'}]
    expected = [{'a': 5, 'b': None, 'c': True},
                None,
                {'a': None, 'b': None, 'c': None},
                {'a': None, 'b': u'bar', 'c': None}]
    arr = pa.array(data)

    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    # Convert the ndarray itself: converting ``data`` a second time would
    # leave ``data_as_ndarray`` unused and make the ``equals`` check below
    # compare two conversions of the very same list.
    arr2 = pa.array(data_as_ndarray)

    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == expected
    assert arr.equals(arr2)

    # Nested
    # NOTE(review): this expected_type is built but never compared against
    # arr.type; only the round-tripped values are checked -- confirm whether
    # a type check was intended here.
    expected_type = pa.struct([
        pa.field('a', pa.struct([pa.field('aa', pa.list_(pa.int64())),
                                 pa.field('ab', pa.bool_())])),
        pa.field('b', pa.string())])
    data = [{'a': {'aa': [5, 6], 'ab': True}, 'b': 'foo'},
            {'a': {'aa': None, 'ab': False}, 'b': None},
            {'a': None, 'b': 'bar'}]
    arr = pa.array(data)
    assert arr.to_pylist() == data

    # Edge cases
    arr = pa.array([{}])
    assert arr.type == pa.struct([])
    assert arr.to_pylist() == [{}]

    # Mixing structs and scalars is rejected
    with pytest.raises((pa.ArrowInvalid, pa.ArrowTypeError)):
        pa.array([1, {'a': 2}])
def test_table_flatten():
    """Flatten a table holding a struct, a nested struct and a bool column."""
    inner_type = pa.struct([pa.field('x', pa.int16()),
                            pa.field('y', pa.float32())])
    outer_type = pa.struct([pa.field('nest', inner_type)])

    struct_col = pa.array([(1, 2.5), (3, 4.5)], type=inner_type)
    nested_col = pa.array([((11, 12.5),), ((13, 14.5),)], type=outer_type)
    bool_col = pa.array([False, True], type=pa.bool_())

    table = pa.Table.from_arrays([struct_col, nested_col, bool_col],
                                 names=['a', 'b', 'c'])
    flattened = table.flatten()
    flattened._validate()

    # Only one struct level is expanded: 'b.nest' is still a struct column.
    expected_columns = [
        pa.array([1, 3], type=pa.int16()),
        pa.array([2.5, 4.5], type=pa.float32()),
        pa.array([(11, 12.5), (13, 14.5)], type=inner_type),
        bool_col,
    ]
    expected = pa.Table.from_arrays(expected_columns,
                                    names=['a.x', 'a.y', 'b.nest', 'c'])
    assert flattened.equals(expected)
def test_field_flatten():
    """Flatten plain, struct and doubly-nested struct fields."""
    meta = {b'foo': b'bar'}

    # A non-struct field flattens to itself.
    f0 = pa.field('foo', pa.int32()).add_metadata(meta)
    assert f0.flatten() == [f0]

    f1 = pa.field('bar', pa.float64(), nullable=False)
    children = pa.struct([f0, f1])

    non_null_parent = pa.field('ff', children, nullable=False)
    flattened = non_null_parent.flatten()
    assert flattened == [
        pa.field('ff.foo', pa.int32()).add_metadata(meta),
        pa.field('ff.bar', pa.float64(), nullable=False)]  # XXX

    # Nullable parent makes flattened child nullable
    ff = pa.field('ff', children)
    flattened = ff.flatten()
    assert flattened == [
        pa.field('ff.foo', pa.int32()).add_metadata(meta),
        pa.field('ff.bar', pa.float64())]

    # Only the outermost struct level is expanded.
    fff = pa.field('fff', pa.struct([ff]))
    expected = [pa.field('fff.ff', pa.struct([f0, f1]))]
    assert fff.flatten() == expected
def test_struct_from_mixed_sequence():
    """Mixing a tuple and a dict in one struct array input raises TypeError."""
    # It is forbidden to mix dicts and tuples when initializing a struct array
    struct_type = pa.struct([pa.field('a', pa.int32()),
                             pa.field('b', pa.string()),
                             pa.field('c', pa.bool_())])
    as_tuple = (5, 'foo', True)
    as_dict = {'a': 6, 'b': 'bar', 'c': False}
    mixed_rows = [as_tuple, as_dict]
    with pytest.raises(TypeError):
        pa.array(mixed_rows, type=struct_type)
def test_struct_type():
    """Check that a struct type reports and yields the fields it was built from."""
    # Two of the three fields share the name 'a'.
    fields = [pa.field('a', pa.int64()),
              pa.field('a', pa.int32()),
              pa.field('b', pa.int32())]
    ty = pa.struct(fields)

    assert len(ty) == ty.num_children == 3
    assert list(ty) == fields

    # Element-wise comparison; without ``assert`` the result of ``a == b``
    # would be silently discarded and the loop would check nothing.
    for a, b in zip(ty, fields):
        assert a == b
def test_is_nested_or_struct():
    """Check ``types.is_struct`` and ``types.is_nested`` on sample types."""
    struct_type = pa.struct([pa.field('a', pa.int32()),
                             pa.field('b', pa.int8()),
                             pa.field('c', pa.string())])

    # is_struct: true for the struct, false for a list
    assert types.is_struct(struct_type)
    assert not types.is_struct(pa.list_(pa.int32()))

    # is_nested: true for struct and list, false for a primitive
    assert types.is_nested(struct_type)
    assert types.is_nested(pa.list_(pa.int32()))
    assert not types.is_nested(pa.int32())
def bq_to_arrow_struct_data_type(field):
    """Build a pyarrow struct type from the subfields of *field*.

    Each entry of ``field.fields`` is converted with ``bq_to_arrow_field``.
    Returns ``None`` as soon as one conversion yields a falsy result, and the
    ``pyarrow.struct`` of all converted subfields otherwise.
    """
    converted = []
    for child in field.fields:
        arrow_child = bq_to_arrow_field(child)
        if not arrow_child:
            # Could not determine a subfield type. Fallback to type
            # inference.
            return None
        converted.append(arrow_child)
    return pyarrow.struct(converted)
def get_many_types():
    """Return a tuple of assorted Arrow data types.

    The types are created on each call rather than stored at module level:
    the pa.dictionary type holds a pyarrow array, and a test in test_array.py
    checks that the default memory pool has zero allocated bytes.
    """
    def union_members(**a_options):
        # Field 'a' optionally takes extra options (e.g. nullable=False).
        return [pa.field('a', pa.binary(10), **a_options),
                pa.field('b', pa.string())]

    flat_types = (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
    )
    struct_types = (
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
    )
    union_types = (
        pa.union(union_members(), mode=pa.lib.UnionMode_DENSE),
        pa.union(union_members(), mode=pa.lib.UnionMode_SPARSE),
        pa.union(union_members(nullable=False),
                 mode=pa.lib.UnionMode_SPARSE),
    )
    return (
        flat_types
        + (pa.list_(pa.int32()),)
        + struct_types
        + union_types
        + (pa.dictionary(pa.int32(), pa.string()),)
    )
def test_column_flatten():
    """Flatten a struct column, both with rows and empty."""
    struct_type = pa.struct([pa.field('x', pa.int16()),
                             pa.field('y', pa.float32())])

    # (struct rows, expected 'x' values, expected 'y' values)
    scenarios = [
        ([(1, 2.5), (3, 4.5), (5, 6.5)], [1, 3, 5], [2.5, 4.5, 6.5]),
        # Empty column
        ([], [], []),
    ]
    for rows, x_values, y_values in scenarios:
        struct_arr = pa.array(rows, type=struct_type)
        col = pa.Column.from_array('foo', struct_arr)
        x, y = col.flatten()
        assert x == pa.column('foo.x', pa.array(x_values, type=pa.int16()))
        assert y == pa.column('foo.y',
                              pa.array(y_values, type=pa.float32()))
def test_struct_value_subscripting(self):
    """Subscript struct array elements by field name."""
    struct_type = pa.struct([pa.field('x', pa.int16()),
                             pa.field('y', pa.float32())])
    rows = [(1, 2.5), (3, 4.5), (5, 6.5)]
    arr = pa.array(rows, type=struct_type)

    for position, (x_value, y_value) in enumerate(rows):
        assert arr[position]['x'] == x_value
        assert arr[position]['y'] == y_value

    # Row index beyond the end of the array
    with pytest.raises(IndexError):
        arr[4]['non-existent']

    # Valid row, unknown field name
    with pytest.raises(KeyError):
        arr[0]['non-existent']
def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type):
    """Convert a field with 14 subfields and compare with the expected struct."""
    # (subfield name, BigQuery type) pairs.
    columns = (
        ("field01", "STRING"),
        ("field02", "BYTES"),
        ("field03", "INTEGER"),
        ("field04", "INT64"),
        ("field05", "FLOAT"),
        ("field06", "FLOAT64"),
        ("field07", "NUMERIC"),
        ("field08", "BOOLEAN"),
        ("field09", "BOOL"),
        ("field10", "TIMESTAMP"),
        ("field11", "DATE"),
        ("field12", "TIME"),
        ("field13", "DATETIME"),
        ("field14", "GEOGRAPHY"),
    )
    fields = tuple(
        schema.SchemaField(name, type_name) for name, type_name in columns
    )
    field = schema.SchemaField(
        "ignored_name", bq_type, mode="NULLABLE", fields=fields
    )

    actual = module_under_test.bq_to_arrow_data_type(field)

    # Expected Arrow types, in the same order as ``columns``.
    arrow_types = (
        pyarrow.string(),
        pyarrow.binary(),
        pyarrow.int64(),
        pyarrow.int64(),
        pyarrow.float64(),
        pyarrow.float64(),
        module_under_test.pyarrow_numeric(),
        pyarrow.bool_(),
        pyarrow.bool_(),
        module_under_test.pyarrow_timestamp(),
        pyarrow.date32(),
        module_under_test.pyarrow_time(),
        module_under_test.pyarrow_datetime(),
        pyarrow.string(),
    )
    expected = pyarrow.struct(
        tuple(
            pyarrow.field(name, arrow_type)
            for (name, _), arrow_type in zip(columns, arrow_types)
        )
    )

    assert pyarrow.types.is_struct(actual)
    assert actual.num_children == len(fields)
    assert actual.equals(expected)
def test_types_hashable():
    """Check that data types hash consistently and work as dict keys."""
    struct_type = pa.struct([pa.field('a', pa.int32()),
                             pa.field('b', pa.int8()),
                             pa.field('c', pa.string())])
    sample_types = [
        pa.null(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        struct_type,
    ]

    position_by_type = {}
    for position, data_type in enumerate(sample_types):
        # Hashing the same object twice must give the same value.
        assert hash(data_type) == hash(data_type)
        position_by_type[data_type] = position
        assert position_by_type[data_type] == position
def test_struct_from_dicts():
    """Build struct arrays from dicts with an explicit struct type."""
    struct_type = pa.struct([pa.field('a', pa.int32()),
                             pa.field('b', pa.string()),
                             pa.field('c', pa.bool_())])

    # Empty input
    assert pa.array([], type=struct_type).to_pylist() == []

    # Fully populated rows round-trip unchanged
    rows = [{'a': 5, 'b': 'foo', 'c': True},
            {'a': 6, 'b': 'bar', 'c': False}]
    assert pa.array(rows, type=struct_type).to_pylist() == rows

    # With omitted values
    rows = [{'a': 5, 'c': True},
            None,
            {},
            {'a': None, 'b': 'bar'}]
    converted = pa.array(rows, type=struct_type)
    expected_rows = [{'a': 5, 'b': None, 'c': True},
                     None,
                     {'a': None, 'b': None, 'c': None},
                     {'a': None, 'b': 'bar', 'c': None}]
    assert converted.to_pylist() == expected_rows
def test_struct_array_flatten():
    """Flatten struct arrays (whole and sliced) with various null layouts."""
    struct_type = pa.struct([pa.field('x', pa.int16()),
                             pa.field('y', pa.float32())])

    def check_children(children, expected_x, expected_y):
        got_x, got_y = children
        assert got_x.to_pylist() == expected_x
        assert got_y.to_pylist() == expected_y

    # No nulls: also check the child types.
    arr = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=struct_type)
    xs, ys = arr.flatten()
    assert xs.type == pa.int16()
    assert ys.type == pa.float32()
    check_children((xs, ys), [1, 3, 5], [2.5, 4.5, 6.5])
    check_children(arr[1:].flatten(), [3, 5], [4.5, 6.5])

    # (rows, expected (x, y) for the whole array, expected (x, y) for [1:])
    null_scenarios = [
        # Null struct
        ([(1, 2.5), None, (3, 4.5)],
         ([1, None, 3], [2.5, None, 4.5]),
         ([None, 3], [None, 4.5])),
        # Null child values
        ([(1, None), (2, 3.5), (None, 4.5)],
         ([1, 2, None], [None, 3.5, 4.5]),
         ([2, None], [3.5, 4.5])),
        # Both null struct and null child values
        ([(1, None), None, (None, 2.5)],
         ([1, None, None], [None, None, 2.5]),
         ([None, None], [None, 2.5])),
    ]
    for rows, whole, sliced in null_scenarios:
        arr = pa.array(rows, type=struct_type)
        check_children(arr.flatten(), *whole)
        check_children(arr[1:].flatten(), *sliced)
def __call__(self):
    """Return a struct type with list-of-string 'language' and 'translation' fields."""
    string_list = pa.list_(pa.string())
    members = {"language": string_list, "translation": string_list}
    return pa.struct(members)
def get_type_and_builtins(self, n, type_name):
    """
    Return a `(arrow type, list)` tuple where the arrow type
    corresponds to the given logical *type_name*, and the list
    is a list of *n* random-generated Python objects compatible
    with the arrow type.

    *type_name* is one of 'bool', 'decimal', 'ascii', 'unicode',
    'int64 list', 'struct from tuples', 'binary', 'binary<N>' (fixed-size
    binary of N > 0 bytes), or a name starting with 'int', 'uint', 'float'
    or 'struct'.  Any other value raises ValueError.
    """
    size = None

    # Map the logical type name to a generator "kind".  'struct from tuples'
    # must be matched by its full name *before* the generic 'struct' prefix
    # test, otherwise it is folded into 'struct' and its dedicated tuple
    # factory below can never be selected.
    if type_name in ('bool', 'decimal', 'ascii', 'unicode', 'int64 list',
                     'struct from tuples'):
        kind = type_name
    elif type_name.startswith(('int', 'uint')):
        kind = 'int'
    elif type_name.startswith('float'):
        kind = 'float'
    elif type_name.startswith('struct'):
        kind = 'struct'
    elif type_name == 'binary':
        kind = 'varying binary'
    elif type_name.startswith('binary'):
        kind = 'fixed binary'
        # The byte width follows the 'binary' prefix, e.g. 'binary10'.
        size = int(type_name[6:])
        assert size > 0
    else:
        raise ValueError("unrecognized type %r" % (type_name,))

    # Pick the arrow type for that kind.
    if kind in ('int', 'float'):
        # The type name doubles as the pyarrow factory name (pa.int32, ...).
        ty = getattr(pa, type_name)()
    elif kind == 'bool':
        ty = pa.bool_()
    elif kind == 'decimal':
        ty = pa.decimal128(9, 9)
    elif kind == 'fixed binary':
        ty = pa.binary(size)
    elif kind == 'varying binary':
        ty = pa.binary()
    elif kind in ('ascii', 'unicode'):
        ty = pa.string()
    elif kind == 'int64 list':
        ty = pa.list_(pa.int64())
    elif kind in ('struct', 'struct from tuples'):
        # Dict-based and tuple-based data share the same struct type.
        ty = pa.struct([pa.field('u', pa.int64()),
                        pa.field('v', pa.float64()),
                        pa.field('w', pa.bool_())])

    # Data generators, keyed by kind; each is called with the item count.
    factories = {
        'int': self.generate_int_list,
        'float': self.generate_float_list,
        'bool': self.generate_bool_list,
        'decimal': self.generate_decimal_list,
        'fixed binary': partial(self.generate_fixed_binary_list,
                                size=size),
        'varying binary': partial(self.generate_varying_binary_list,
                                  min_size=3, max_size=40),
        'ascii': partial(self.generate_ascii_string_list,
                         min_size=3, max_size=40),
        'unicode': partial(self.generate_unicode_string_list,
                           min_size=3, max_size=40),
        'int64 list': partial(self.generate_int_list_list,
                              min_size=0, max_size=20),
        'struct': self.generate_dict_list,
        'struct from tuples': self.generate_tuple_list,
    }
    data = factories[kind](n)
    return ty, data
} """ ], create_expected=lambda list_factory, binary_type: pa.RecordBatch. from_arrays([ pa.array([None], type=list_factory(binary_type)), ], ["context_a"])), dict( testcase_name="feature_lists_with_no_sequence_features", schema_text_proto=None, sequence_examples_text_proto=[""" feature_lists {} """], create_expected=lambda list_factory, binary_type: pa.RecordBatch. from_arrays([ pa.StructArray.from_buffers(pa.struct([]), 1, [None]), ], [_TEST_SEQUENCE_COLUMN_NAME])), dict( testcase_name="without_schema_only_context_features", schema_text_proto=None, sequence_examples_text_proto=[ """ context { feature { key: 'context_a' value { int64_list { value: [1, 2] } } }
pa.array([[1], None, [3, 4]], type=pa.list_(pa.int32())) ], ["f1", "f2"]) }, { "list<utf8>": pa.array([u"abc", None], type=pa.utf8()) }], expected_output={ "list<utf8>": pa.array([None, None, None, None, None, None, None, u"abc", None], type=pa.utf8()), "struct<int32, list<int32>>": pa.array([ None, None, None, None, (1, [1]), (2, None), (None, [3, 4]), None, None ], type=pa.struct([ pa.field("f1", pa.int32()), pa.field("f2", pa.list_(pa.int32())) ])), }), ] _MERGE_INVALID_INPUT_TEST_CASES = [ dict( testcase_name="not_a_list_of_tables", inputs=[pa.Table.from_arrays([pa.array([1])], ["f1"]), 1], expected_error_regexp="incompatible function arguments", ), dict( testcase_name="not_a_list", inputs=1, expected_error_regexp="incompatible function arguments", ),
) data_x = ( "data/1KGP/ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz" ) data_y = "data/1KGP/ALL.chrY.phase3_integrated_v1b.20130502.genotypes.vcf.gz" chrom = 20 vfname = data_t.format(chrom) vf = VariantFile(vfname, mode="r", threads=4) hdr, samples = get_header(vf) vf.close() # hdr.to_parquet("data/1KGP/pq/chr20-header.parquet") cols = get_vcf_cols(hdr, samples) schema = pa.schema(cols) struct = pa.struct(cols) # run in an asyncio event loop, have better control. # wait for batches of 4, append to parquet file loop = asyncio.get_event_loop() batchsize = 4 # ncores/2 tproc, tio = 0, 0 # timing pqwriter = pq.ParquetWriter("test.parquet", schema, flavor="spark") for i in range(0, 316, batchsize): with ProcessPoolExecutor(4) as executor: t0 = time.process_time() tasks = [ loop.run_in_executor( executor, to_arrow, vfname,
ne([a], [b]) eq([a, c], [a, c]) eq([a, c], [d]) ne([c, a], [a, c]) assert not pa.chunked_array([], type=pa.int32()).equals(None) @pytest.mark.parametrize( ('data', 'typ'), [([True, False, True, True], pa.bool_()), ([1, 2, 4, 6], pa.int64()), ([1.0, 2.5, None], pa.float64()), (['a', None, 'b'], pa.string()), ([], pa.list_(pa.uint8())), ([[1, 2], [3]], pa.list_(pa.int64())), ([['a'], None, ['b', 'c']], pa.list_(pa.string())), ([(1, 'a'), (2, 'c'), None ], pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))]) def test_chunked_array_pickle(data, typ): arrays = [] while data: arrays.append(pa.array(data[:2], type=typ)) data = data[2:] array = pa.chunked_array(arrays, type=typ) array.validate() result = pickle.loads(pickle.dumps(array)) result.validate() assert result.equals(array) @pytest.mark.pandas def test_chunked_array_to_pandas(): data = [pa.array([-10, -5, 0, 5, 10])]
def __call__(self):
    """Return a struct type with one string field per entry of ``self.languages``."""
    members = {}
    for language in self.languages:
        members[language] = pa.string()
    return pa.struct(members)
assert result.equals(expected) @pytest.mark.parametrize( ('data', 'typ'), [ ([True, False, True, True], pa.bool_()), ([1, 2, 4, 6], pa.int64()), ([1.0, 2.5, None], pa.float64()), (['a', None, 'b'], pa.string()), ([], None), ([[1, 2], [3]], pa.list_(pa.int64())), ([['a'], None, ['b', 'c']], pa.list_(pa.string())), ([(1, 'a'), (2, 'c'), None], pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())])) ] ) def test_array_pickle(data, typ): # Allocate here so that we don't have any Arrow data allocated. # This is needed to ensure that allocator tests can be reliable. array = pa.array(data, type=typ) result = pickle.loads(pickle.dumps(array)) assert array.equals(result) @pytest.mark.parametrize( ('type', 'expected'), [ (pa.null(), 'empty'), (pa.bool_(), 'bool'),
('uint8', np.arange(5)), ('int8', np.arange(5)), ('uint16', np.arange(5)), ('int16', np.arange(5)), ('uint32', np.arange(5)), ('int32', np.arange(5)), ('uint64', np.arange(5, 10)), ('int64', np.arange(5, 10)), ('float', np.arange(0, 0.5, 0.1)), ('double', np.arange(0, 0.5, 0.1)), ('string', ['a', 'b', None, 'ddd', 'ee']), ('binary', [b'a', b'b', b'c', b'ddd', b'ee']), (pa.binary(3), [b'abc', b'bcd', b'cde', b'def', b'efg']), (pa.list_(pa.int8()), [[1, 2], [3, 4], [5, 6], None, [9, 16]]), (pa.large_list(pa.int16()), [[1], [2, 3, 4], [5, 6], None, [9, 16]]), (pa.struct([('a', pa.int8()), ('b', pa.int8())]), [{ 'a': 1, 'b': 2 }, None, { 'a': 3, 'b': 4 }, None, { 'a': 5, 'b': 6 }]), ] numerical_arrow_types = [ pa.int8(), pa.int16(), pa.int64(),
("time64(us)", pa.time64("us")), ("time64(ns)", pa.time64("ns")), ("timestamp(s)", pa.timestamp("s")), ("timestamp(ms)", pa.timestamp("ms")), ("timestamp(us)", pa.timestamp("us")), ("timestamp(ns)", pa.timestamp("ns")), ("date32", pa.date32()), ("date64", pa.date64()), ("string", pa.string()), ("large_string", pa.large_string()), ("utf8", pa.utf8()), ("large_utf8", pa.large_utf8()), ("binary", pa.binary()), ("binary(128)", pa.binary(128)), ("large_binary", pa.large_binary()), ("struct<num:int64>", pa.struct([("num", pa.int64())])), ("list_<int64>", pa.list_(pa.int64())), ("list_<list_<int64>>", pa.list_(pa.list_(pa.int64()))), ("large_list<int64>", pa.large_list(pa.int64())), ("large_list<large_list<int64>>", pa.large_list(pa.large_list(pa.int64()))), ( "struct<num:int64, newnum:int64>", pa.struct([("num", pa.int64()), ("newnum", pa.int64())]), ), ( "struct<num:int64, arr:list_<int64>>", pa.struct([("num", pa.int64()), ("arr", pa.list_(pa.int64()))]), ), ( "list_<struct<num:int64,desc:string>>",
pa.time32('s'), pa.time64('us'), pa.date32(), pa.timestamp('us'), pa.timestamp('us', tz='UTC'), pa.timestamp('us', tz='Europe/Paris'), pa.float16(), pa.float32(), pa.float64(), pa.decimal128(19, 4), pa.string(), pa.binary(), pa.binary(10), pa.list_(pa.int32()), pa.struct([pa.field('a', pa.int32()), pa.field('b', pa.int8()), pa.field('c', pa.string())]), pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE), pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE), # XXX Needs array pickling # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])), ] def test_is_boolean(): assert types.is_boolean(pa.bool_()) assert not types.is_boolean(pa.int8())