def test_timestamps_notimezone_nulls(self):
    df = pd.DataFrame({
        'datetime64': np.array([
            '2007-07-13T01:23:34.123',
            None,
            '2010-08-13T05:46:57.437'],
            dtype='datetime64[ms]')
    })
    field = pa.field('datetime64', pa.timestamp('ms'))
    schema = pa.schema([field])
    self._check_pandas_roundtrip(
        df,
        timestamps_to_ms=True,
        expected_schema=schema,
    )

    df = pd.DataFrame({
        'datetime64': np.array([
            '2007-07-13T01:23:34.123456789',
            None,
            '2010-08-13T05:46:57.437699912'],
            dtype='datetime64[ns]')
    })
    field = pa.field('datetime64', pa.timestamp('ns'))
    schema = pa.schema([field])
    self._check_pandas_roundtrip(
        df,
        timestamps_to_ms=False,
        expected_schema=schema,
    )
def make_recordbatch(length):
    schema = pa.schema([pa.field('f0', pa.int16()),
                        pa.field('f1', pa.int16())])
    a0 = pa.array(np.random.randint(0, 255, size=length, dtype=np.int16))
    a1 = pa.array(np.random.randint(0, 255, size=length, dtype=np.int16))
    batch = pa.RecordBatch.from_arrays([a0, a1], schema)
    return batch
def test_is_union():
    for mode in [pa.lib.UnionMode_SPARSE, pa.lib.UnionMode_DENSE]:
        assert types.is_union(pa.union([pa.field('a', pa.int32()),
                                        pa.field('b', pa.int8()),
                                        pa.field('c', pa.string())],
                                       mode=mode))
    assert not types.is_union(pa.list_(pa.int32()))
def test_field_add_remove_metadata():
    import collections

    f0 = pa.field('foo', pa.int32())

    assert f0.metadata is None

    metadata = {b'foo': b'bar', b'pandas': b'badger'}
    metadata2 = collections.OrderedDict([
        (b'a', b'alpha'),
        (b'b', b'beta')
    ])

    f1 = f0.add_metadata(metadata)
    assert f1.metadata == metadata

    f2 = f0.add_metadata(metadata2)
    assert f2.metadata == metadata2

    with pytest.raises(TypeError):
        f0.add_metadata([1, 2, 3])

    f3 = f1.remove_metadata()
    assert f3.metadata is None

    # idempotent
    f4 = f3.remove_metadata()
    assert f4.metadata is None

    f5 = pa.field('foo', pa.int32(), True, metadata)
    f6 = f0.add_metadata(metadata)
    assert f5.equals(f6)
def test_table_safe_casting():
    data = [
        pa.array(range(5), type=pa.int64()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    table = pa.Table.from_arrays(data, names=tuple('abcd'))

    expected_data = [
        pa.array(range(5), type=pa.int32()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
        pa.array([1, 2, 3, 4, 5], type=pa.int64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd'))

    target_schema = pa.schema([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int16()),
        pa.field('c', pa.int64()),
        pa.field('d', pa.string())
    ])
    casted_table = table.cast(target_schema)

    assert casted_table.equals(expected_table)
def test_schema():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]
    sch = pa.schema(fields)

    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]

    assert len(sch) == 3
    assert sch[0].name == 'foo'
    assert sch[0].type == fields[0].type
    assert sch.field_by_name('foo').name == 'foo'
    assert sch.field_by_name('foo').type == fields[0].type

    assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""

    with pytest.raises(TypeError):
        pa.schema([None])
def test_struct_from_tuples():
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])

    data = [(5, 'foo', True),
            (6, 'bar', False)]
    expected = [{'a': 5, 'b': 'foo', 'c': True},
                {'a': 6, 'b': 'bar', 'c': False}]
    arr = pa.array(data, type=ty)

    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    arr2 = pa.array(data_as_ndarray, type=ty)

    assert arr.to_pylist() == expected

    assert arr.equals(arr2)

    # With omitted values
    data = [(5, 'foo', None),
            None,
            (6, None, False)]
    expected = [{'a': 5, 'b': 'foo', 'c': None},
                None,
                {'a': 6, 'b': None, 'c': False}]
    arr = pa.array(data, type=ty)
    assert arr.to_pylist() == expected

    # Invalid tuple size
    for tup in [(5, 'foo'), (), ('5', 'foo', True, None)]:
        with pytest.raises(ValueError, match="(?i)tuple size"):
            pa.array([tup], type=ty)
def test_struct_array_slice():
    # ARROW-2311: slicing nested arrays needs special care
    ty = pa.struct([pa.field('a', pa.int8()), pa.field('b', pa.float32())])
    arr = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)
    assert arr[1:].to_pylist() == [{'a': 3, 'b': 4.5},
                                   {'a': 5, 'b': 6.5}]
def test_struct_array_field():
    ty = pa.struct([pa.field('x', pa.int16()),
                    pa.field('y', pa.float32())])
    a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)

    x0 = a.field(0)
    y0 = a.field(1)
    x1 = a.field(-2)
    y1 = a.field(-1)
    x2 = a.field('x')
    y2 = a.field('y')

    assert isinstance(x0, pa.lib.Int16Array)
    assert isinstance(y1, pa.lib.FloatArray)
    assert x0.equals(pa.array([1, 3, 5], type=pa.int16()))
    assert y0.equals(pa.array([2.5, 4.5, 6.5], type=pa.float32()))
    assert x0.equals(x1)
    assert x0.equals(x2)
    assert y0.equals(y1)
    assert y0.equals(y2)

    for invalid_index in [None, pa.int16()]:
        with pytest.raises(TypeError):
            a.field(invalid_index)

    for invalid_index in [3, -3]:
        with pytest.raises(IndexError):
            a.field(invalid_index)

    for invalid_name in ['z', '']:
        with pytest.raises(KeyError):
            a.field(invalid_name)
def test_buffers_nested():
    a = pa.array([[1, 2], None, [3, None, 4, 5]], type=pa.list_(pa.int64()))
    buffers = a.buffers()
    assert len(buffers) == 4
    # The parent buffers
    null_bitmap = buffers[0].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000101
    offsets = buffers[1].to_pybytes()
    assert struct.unpack('4i', offsets) == (0, 2, 2, 6)
    # The child buffers
    null_bitmap = buffers[2].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00110111
    values = buffers[3].to_pybytes()
    assert struct.unpack('qqq8xqq', values) == (1, 2, 3, 4, 5)

    a = pa.array([(42, None), None, (None, 43)],
                 type=pa.struct([pa.field('a', pa.int8()),
                                 pa.field('b', pa.int16())]))
    buffers = a.buffers()
    assert len(buffers) == 5
    # The parent buffer
    null_bitmap = buffers[0].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000101
    # The child buffers: 'a'
    null_bitmap = buffers[1].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000001
    values = buffers[2].to_pybytes()
    assert struct.unpack('bxx', values) == (42,)
    # The child buffers: 'b'
    null_bitmap = buffers[3].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000100
    values = buffers[4].to_pybytes()
    assert struct.unpack('4xh', values) == (43,)
def test_type_schema_pickling():
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped
def test_struct_type():
    fields = [pa.field('a', pa.int64()),
              pa.field('a', pa.int32()),
              pa.field('b', pa.int32())]
    ty = pa.struct(fields)

    assert len(ty) == ty.num_children == 3
    assert list(ty) == fields

    for a, b in zip(ty, fields):
        a == b

    # Construct from list of tuples
    ty = pa.struct([('a', pa.int64()),
                    ('a', pa.int32()),
                    ('b', pa.int32())])
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        a == b

    # Construct from mapping
    fields = [pa.field('a', pa.int64()),
              pa.field('b', pa.int32())]
    ty = pa.struct(OrderedDict([('a', pa.int64()),
                                ('b', pa.int32())]))
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        a == b
def test_table_unsafe_casting():
    data = [
        pa.array(range(5), type=pa.int64()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
        pa.array([1.1, 2.2, 3.3, 4.4, 5.5], type=pa.float64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    table = pa.Table.from_arrays(data, names=tuple('abcd'))

    expected_data = [
        pa.array(range(5), type=pa.int32()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
        pa.array([1, 2, 3, 4, 5], type=pa.int64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd'))

    target_schema = pa.schema([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int16()),
        pa.field('c', pa.int64()),
        pa.field('d', pa.string())
    ])

    with pytest.raises(pa.ArrowInvalid,
                       match='Floating point value truncated'):
        table.cast(target_schema)

    casted_table = table.cast(target_schema, safe=False)
    assert casted_table.equals(expected_table)
def test_recordbatch_basics():
    data = [
        pa.array(range(5)),
        pa.array([-10, -5, 0, 5, 10])
    ]

    batch = pa.RecordBatch.from_arrays(data, ['c0', 'c1'])
    assert not batch.schema.metadata

    assert len(batch) == 5
    assert batch.num_rows == 5
    assert batch.num_columns == len(data)
    assert batch.to_pydict() == OrderedDict([
        ('c0', [0, 1, 2, 3, 4]),
        ('c1', [-10, -5, 0, 5, 10])
    ])

    with pytest.raises(IndexError):
        # bounds checking
        batch[2]

    # Schema passed explicitly
    schema = pa.schema([pa.field('c0', pa.int16()),
                        pa.field('c1', pa.int32())],
                       metadata={b'foo': b'bar'})
    batch = pa.RecordBatch.from_arrays(data, schema)
    assert batch.schema == schema
def test_field_metadata():
    f1 = pa.field('a', pa.int8())
    f2 = pa.field('a', pa.int8(), metadata={})
    f3 = pa.field('a', pa.int8(), metadata={b'bizz': b'bazz'})

    assert f1.metadata is None
    assert f2.metadata == {}
    assert f3.metadata[b'bizz'] == b'bazz'
def test_struct_from_mixed_sequence():
    # It is forbidden to mix dicts and tuples when initializing a struct array
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])
    data = [(5, 'foo', True),
            {'a': 6, 'b': 'bar', 'c': False}]
    with pytest.raises(TypeError):
        pa.array(data, type=ty)
def test_field_equality_operators():
    f1 = pa.field('a', pa.int8(), nullable=True)
    f2 = pa.field('a', pa.int8(), nullable=True)
    f3 = pa.field('b', pa.int8(), nullable=True)
    f4 = pa.field('b', pa.int8(), nullable=False)

    assert f1 == f2
    assert f1 != f3
    assert f3 != f4
    assert f1 != 'foo'
def test_fields_hashable():
    in_dict = {}
    fields = [pa.field('a', pa.int64()),
              pa.field('a', pa.int32()),
              pa.field('b', pa.int32())]
    for i, field in enumerate(fields):
        in_dict[field] = i
    assert len(in_dict) == len(fields)
    for i, field in enumerate(fields):
        assert in_dict[field] == i
def dataframe_with_arrays(include_index=False):
    """
    Dataframe with numpy arrays columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    dtypes = [('i1', pa.int8()), ('i2', pa.int16()),
              ('i4', pa.int32()), ('i8', pa.int64()),
              ('u1', pa.uint8()), ('u2', pa.uint16()),
              ('u4', pa.uint32()), ('u8', pa.uint64()),
              ('f4', pa.float32()), ('f8', pa.float64())]

    arrays = OrderedDict()
    fields = []
    for dtype, arrow_dtype in dtypes:
        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
        arrays[dtype] = [
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ]

    fields.append(pa.field('str', pa.list_(pa.string())))
    arrays['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object")
    ]

    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
def test_field():
    t = pa.string()
    f = pa.field('foo', t)

    assert f.name == 'foo'
    assert f.nullable
    assert f.type is t
    assert repr(f) == "pyarrow.Field<foo: string>"

    f = pa.field('foo', t, False)
    assert not f.nullable
def test_field(self):
    t = arrow.string()
    f = arrow.field('foo', t)

    assert f.name == 'foo'
    assert f.nullable
    assert f.type is t
    assert repr(f) == "Field('foo', type=string)"

    f = arrow.field('foo', t, False)
    assert not f.nullable
def test_struct_type():
    fields = [pa.field('a', pa.int64()),
              pa.field('a', pa.int32()),
              pa.field('b', pa.int32())]
    ty = pa.struct(fields)

    assert len(ty) == ty.num_children == 3
    assert list(ty) == fields

    for a, b in zip(ty, fields):
        a == b
def test_struct_type():
    fields = [
        # Duplicate field name on purpose
        pa.field('a', pa.int64()),
        pa.field('a', pa.int32()),
        pa.field('b', pa.int32())
    ]
    ty = pa.struct(fields)

    assert len(ty) == ty.num_children == 3
    assert list(ty) == fields
    assert ty[0].name == 'a'
    assert ty[2].type == pa.int32()
    with pytest.raises(IndexError):
        assert ty[3]

    assert ty['b'] == ty[2]

    # Duplicate
    with pytest.warns(UserWarning):
        with pytest.raises(KeyError):
            ty['a']

    # Not found
    with pytest.raises(KeyError):
        ty['c']

    # Neither integer nor string
    with pytest.raises(TypeError):
        ty[None]

    for a, b in zip(ty, fields):
        a == b

    # Construct from list of tuples
    ty = pa.struct([('a', pa.int64()),
                    ('a', pa.int32()),
                    ('b', pa.int32())])
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        a == b

    # Construct from mapping
    fields = [pa.field('a', pa.int64()),
              pa.field('b', pa.int32())]
    ty = pa.struct(OrderedDict([('a', pa.int64()),
                                ('b', pa.int32())]))
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        a == b

    # Invalid args
    with pytest.raises(TypeError):
        pa.struct([('a', None)])
def test_is_nested_or_struct():
    struct_ex = pa.struct([pa.field('a', pa.int32()),
                           pa.field('b', pa.int8()),
                           pa.field('c', pa.string())])

    assert types.is_struct(struct_ex)
    assert not types.is_struct(pa.list_(pa.int32()))

    assert types.is_nested(struct_ex)
    assert types.is_nested(pa.list_(pa.int32()))
    assert not types.is_nested(pa.int32())
def dataframe_with_lists(include_index=False):
    """
    Dataframe with list columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [],
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2,
                 dtype=np.int64)[::2]
    ]
    fields.append(pa.field('double', pa.list_(pa.float64())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [],
        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
    ]
    fields.append(pa.field('bytes_list', pa.list_(pa.binary())))
    arrays['bytes_list'] = [
        [b"1", b"f"],
        None,
        [b"1"],
        [b"1", b"2", b"3"],
        [],
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        [u"1", u"ä"],
        None,
        [u"1"],
        [u"1", u"2", u"3"],
        [],
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
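# A minimal usage sketch for the helper above, assuming pandas/numpy/pyarrow
# are importable as pd/np/pa: build the frame together with its matching
# Arrow schema and convert it, letting the explicit schema drive the list
# column types.
df, schema = dataframe_with_lists()
table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
assert table.num_columns == len(schema)
assert table.schema[0].type == pa.list_(pa.int64())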
def test_schema_equals_propagates_check_metadata():
    # ARROW-4088
    schema1 = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string())
    ])
    schema2 = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string(), metadata={'a': 'alpha'}),
    ])
    assert not schema1.equals(schema2)
    assert schema1.equals(schema2, check_metadata=False)
def test_schema_repr_with_dictionaries():
    fields = [
        pa.field('one', pa.dictionary(pa.int16(), pa.string())),
        pa.field('two', pa.int32())
    ]
    sch = pa.schema(fields)

    expected = (
        """\
one: dictionary<values=string, indices=int16, ordered=0>
two: int32""")

    assert repr(sch) == expected
def test_table_from_arrays_preserves_column_metadata():
    # Added to test https://issues.apache.org/jira/browse/ARROW-3866
    arr0 = pa.array([1, 2])
    arr1 = pa.array([3, 4])
    field0 = pa.field('field1', pa.int64(), metadata=dict(a="A", b="B"))
    field1 = pa.field('field2', pa.int64(), nullable=False)
    columns = [
        pa.column(field0, arr0),
        pa.column(field1, arr1)
    ]
    table = pa.Table.from_arrays(columns)
    assert b"a" in table.column(0).field.metadata
    assert table.column(1).field.nullable is False
def test_table_pickle():
    data = [
        pa.chunked_array([[1, 2], [3, 4]], type=pa.uint32()),
        pa.chunked_array([["some", "strings", None, ""]], type=pa.string()),
    ]
    schema = pa.schema([pa.field('ints', pa.uint32()),
                        pa.field('strs', pa.string())],
                       metadata={b'foo': b'bar'})
    table = pa.Table.from_arrays(data, schema=schema)

    result = pickle.loads(pickle.dumps(table))
    result._validate()
    assert result.equals(table)
def test_recordbatch_pickle():
    data = [
        pa.array(range(5)),
        pa.array([-10, -5, 0, 5, 10])
    ]
    schema = pa.schema([pa.field('ints', pa.int8()),
                        pa.field('floats', pa.float32()),
                        ]).add_metadata({b'foo': b'bar'})
    batch = pa.RecordBatch.from_arrays(data, schema)

    result = pickle.loads(pickle.dumps(batch))
    assert result.equals(batch)
    assert result.schema == schema
def test_boolean_object_nulls(self):
    arr = np.array([False, None, True] * 100, dtype=object)
    df = pd.DataFrame({'bools': arr})
    field = pa.field('bools', pa.bool_())
    schema = pa.schema([field])
    self._check_pandas_roundtrip(df, expected_schema=schema)
def get_type_and_builtins(self, n, type_name):
    """
    Return a `(arrow type, list)` tuple where the arrow type
    corresponds to the given logical *type_name*, and the list
    is a list of *n* random-generated Python objects compatible
    with the arrow type.
    """
    size = None

    if type_name in ('bool', 'ascii', 'unicode', 'int64 list', 'struct'):
        kind = type_name
    elif type_name.startswith(('int', 'uint')):
        kind = 'int'
    elif type_name.startswith('float'):
        kind = 'float'
    elif type_name == 'binary':
        kind = 'varying binary'
    elif type_name.startswith('binary'):
        kind = 'fixed binary'
        size = int(type_name[6:])
        assert size > 0
    else:
        raise ValueError("unrecognized type %r" % (type_name,))

    if kind in ('int', 'float'):
        ty = getattr(pa, type_name)()
    elif kind == 'bool':
        ty = pa.bool_()
    elif kind == 'fixed binary':
        ty = pa.binary(size)
    elif kind == 'varying binary':
        ty = pa.binary()
    elif kind in ('ascii', 'unicode'):
        ty = pa.string()
    elif kind == 'int64 list':
        ty = pa.list_(pa.int64())
    elif kind == 'struct':
        ty = pa.struct([pa.field('u', pa.int64()),
                        pa.field('v', pa.float64()),
                        pa.field('w', pa.bool_())])

    factories = {
        'int': self.generate_int_list,
        'float': self.generate_float_list,
        'bool': self.generate_bool_list,
        'fixed binary': partial(self.generate_fixed_binary_list,
                                size=size),
        'varying binary': partial(self.generate_varying_binary_list,
                                  min_size=3, max_size=40),
        'ascii': partial(self.generate_ascii_string_list,
                         min_size=3, max_size=40),
        'unicode': partial(self.generate_unicode_string_list,
                           min_size=3, max_size=40),
        'int64 list': partial(self.generate_int_list_list,
                              min_size=0, max_size=20),
        'struct': self.generate_dict_list,
    }
    data = factories[kind](n)
    return ty, data
def make_meta(obj, origin, partition_keys=None):
    """
    Create metadata object for DataFrame.

    .. note::

        This function can, for convenience reasons, also be applied to schema
        objects in which case they are just returned.

    .. warning::

        Information for categoricals will be stripped!

    :meth:`normalize_type` will be applied to normalize type information and
    :meth:`normalize_column_order` will be applied to reorder column
    information.

    Parameters
    ----------
    obj: Union[DataFrame, Schema]
        Object to extract metadata from.
    origin: str
        Origin of the schema data, used for debugging and error reporting.
    partition_keys: Union[None, List[str]]
        Partition keys used to split the dataset.

    Returns
    -------
    schema: SchemaWrapper
        Schema information for DataFrame.
    """
    if isinstance(obj, SchemaWrapper):
        return obj
    if isinstance(obj, pa.Schema):
        return SchemaWrapper(obj, origin)

    if not isinstance(obj, pd.DataFrame):
        raise ValueError(
            "Input must be a pyarrow schema, or a pandas dataframe")

    if ARROW_LARGER_EQ_0130:
        schema = pa.Schema.from_pandas(obj)
    else:
        table = pa.Table.from_pandas(obj)
        schema = table.schema
        del table

    pandas_metadata = _pandas_meta_from_schema(schema)

    # normalize types
    fields = dict([(field.name, field.type) for field in schema])
    for cmd in pandas_metadata["columns"]:
        name = cmd.get("name")
        if name is None:
            continue
        field_name = cmd["field_name"]
        field_idx = schema.get_field_index(field_name)
        field = schema[field_idx]
        (fields[field_name], cmd["pandas_type"], cmd["numpy_type"],
         cmd["metadata"]) = normalize_type(field.type, cmd["pandas_type"],
                                           cmd["numpy_type"],
                                           cmd["metadata"])
    metadata = schema.metadata
    metadata[b"pandas"] = _dict_to_binary(pandas_metadata)
    schema = pa.schema([pa.field(n, t) for n, t in fields.items()], metadata)
    return normalize_column_order(SchemaWrapper(schema, origin),
                                  partition_keys)
def get_pyarrow_translated_schema(string_schema):
    """
    Converts string schema dict to pyarrow schema for writing to parquet.
    :param string_schema:
    :return: pyarrow schema
    """
    def _bq_to_pa_type(field):
        """
        A function to convert BigQuery types to pyarrow types.
        :param field (bigquery.schema.SchemaField)
        :return: pa.DataType
        """
        type_conversions = {
            'STRING': pa.string(),
            'NUMERIC': pa.int64(),
            'BYTE': None,
            'INTEGER': pa.int64(),
            'FLOAT': pa.float64(),
            'BOOLEAN': pa.bool_(),
            'TIMESTAMP': pa.timestamp('us'),
            'DATE': pa.date32(),
            'TIME': pa.time64('us'),
            'DATETIME': pa.timestamp('us'),
            'GEOGRAPHY': None,
        }

        try:
            if field['mode'] == 'REPEATED':
                if field['type'] == 'RECORD':
                    nested_fields = field['fields']
                    # Recursively call to convert the next nested layer.
                    return pa.list_(
                        pa.struct([(fld['name'], _bq_to_pa_type(fld))
                                   for fld in nested_fields]))
                else:
                    # Look up the element type directly; the conversion dict
                    # already holds the pyarrow type for scalar BigQuery types.
                    return pa.list_(type_conversions[field['type']])
            elif field['type'] == 'RECORD':
                nested_fields = field['fields']
                # Recursively call to convert the next nested layer.
                return pa.struct([(fld['name'], _bq_to_pa_type(fld))
                                  for fld in nested_fields])
            else:
                return type_conversions.get(field.get('type'))
        except KeyError as err:
            raise KeyError(
                "Type {} is not a valid BigQuery type and not supported by "
                "this utility.".format(field['type']))

    pa_schema_list = []
    for field in string_schema['fields']:
        field_type = field['type']
        field_name = field['name']
        field_mode = field['mode']
        converted_type = _bq_to_pa_type(field)
        if converted_type is None:
            error_message = 'Error: json schema included a {0:s} field. ' \
                            'BYTE and GEOGRAPHY types cannot ' \
                            'currently be used when outputting to ' \
                            'parquet.'.format(field_type)
            logging.error(error_message)
            raise ValueError(error_message)
        else:
            nullable = False if field_mode == 'REQUIRED' else True
            pa_field = pa.field(name=field_name,
                                type=converted_type
                                # nullable=nullable
                                )
            pa_schema_list.append(pa_field)

    return pa.schema(pa_schema_list)
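# Hypothetical usage sketch for the converter above; the field names and the
# exact shape of the schema dict are illustrative assumptions, not taken from
# the original project.
example_schema = {
    'fields': [
        {'name': 'id', 'type': 'INTEGER', 'mode': 'REQUIRED'},
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'scores', 'type': 'FLOAT', 'mode': 'REPEATED'},
    ]
}
parquet_schema = get_pyarrow_translated_schema(example_schema)
# Expected fields: id: int64, name: string, scores: list<item: double>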
def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
                        safe=True):
    (all_names,
     column_names,
     index_column_names,
     index_descriptors,
     index_columns,
     columns_to_convert,
     convert_fields) = _get_columns_to_convert(df, schema, preserve_index,
                                               columns)

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols and ncols > 1.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100 and ncols > 1:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, field):
        if field is None:
            field_nullable = True
            type_ = None
        else:
            field_nullable = field.nullable
            type_ = field.type

        try:
            result = pa.array(col, type=type_, from_pandas=True, safe=safe)
        except (pa.ArrowInvalid,
                pa.ArrowNotImplementedError,
                pa.ArrowTypeError) as e:
            e.args += ("Conversion failed for column {!s} with type {!s}"
                       .format(col.name, col.dtype),)
            raise e
        if not field_nullable and result.null_count > 0:
            raise ValueError("Field {} was non-nullable but pandas column "
                             "had {} null values".format(str(field),
                                                         result.null_count))
        return result

    def _can_definitely_zero_copy(arr):
        return (isinstance(arr, np.ndarray) and
                arr.flags.contiguous and
                issubclass(arr.dtype.type, np.integer))

    if nthreads == 1:
        arrays = [convert_column(c, f)
                  for c, f in zip(columns_to_convert, convert_fields)]
    else:
        arrays = []
        with futures.ThreadPoolExecutor(nthreads) as executor:
            for c, f in zip(columns_to_convert, convert_fields):
                if _can_definitely_zero_copy(c.values):
                    arrays.append(convert_column(c, f))
                else:
                    arrays.append(executor.submit(convert_column, c, f))

        for i, maybe_fut in enumerate(arrays):
            if isinstance(maybe_fut, futures.Future):
                arrays[i] = maybe_fut.result()

    types = [x.type for x in arrays]

    if schema is None:
        fields = []
        for name, type_ in zip(all_names, types):
            name = name if name is not None else 'None'
            fields.append(pa.field(name, type_))
        schema = pa.schema(fields)

    pandas_metadata = construct_metadata(columns_to_convert, df, column_names,
                                         index_columns, index_descriptors,
                                         preserve_index, types)
    metadata = deepcopy(schema.metadata) if schema.metadata else dict()
    metadata.update(pandas_metadata)
    schema = schema.with_metadata(metadata)

    # If dataframe is empty but with RangeIndex ->
    # remember the length of the indexes
    n_rows = None
    if len(arrays) == 0:
        try:
            kind = index_descriptors[0]["kind"]
            if kind == "range":
                start = index_descriptors[0]["start"]
                stop = index_descriptors[0]["stop"]
                step = index_descriptors[0]["step"]
                n_rows = len(range(start, stop, step))
        except IndexError:
            pass

    return arrays, schema, n_rows
def generate(
    path,
    parameters,
    format={"name": "parquet", "row_group_size": 64},
    use_threads=True,
):
    """
    Generate dataset using given parameters and write to given format

    Parameters
    ----------
    path : str or file-like object
        Path to write to
    parameters : Parameters
        Parameters specifying how to randomly generate data
    format : Dict
        Format to write
    """
    # Initialize seeds
    if parameters.seed is not None:
        np.random.seed(parameters.seed)
    column_seeds = np.arange(len(parameters.column_parameters))
    np.random.shuffle(column_seeds)

    # For each column, use a generic Mimesis producer to create an Iterable
    # for generating data
    for i, column_params in enumerate(parameters.column_parameters):
        column_params.generator = column_params.generator(
            Generic("en", seed=column_seeds[i]))

    # Get schema for each column
    schema = pa.schema([
        pa.field(
            name=str(i),
            type=pa.from_numpy_dtype(
                type(next(iter(column_params.generator)))),
            nullable=column_params.null_frequency > 0,
        )
        for i, column_params in enumerate(parameters.column_parameters)
    ])

    # Initialize column data and which columns should be sorted
    column_data = [None] * len(parameters.column_parameters)
    columns_to_sort = [
        str(i)
        for i, column_params in enumerate(parameters.column_parameters)
        if column_params.is_sorted
    ]

    # Generate data
    if not use_threads:
        for i, column_params in enumerate(parameters.column_parameters):
            column_data[i] = _generate_column(column_params,
                                              parameters.num_rows)
    else:
        pool = Pool(pa.cpu_count())
        column_data = pool.starmap(
            _generate_column,
            [(column_params, parameters.num_rows)
             for i, column_params in enumerate(parameters.column_parameters)],
        )
        pool.close()
        pool.join()

    # Convert to Pandas DataFrame and sort columns appropriately
    tbl = pa.Table.from_arrays(
        column_data,
        schema=schema,
    )
    if columns_to_sort:
        tbl = tbl.to_pandas()
        tbl = tbl.sort_values(columns_to_sort)
        tbl = pa.Table.from_pandas(tbl, schema)

    # Write
    _write(tbl, path, format)
def test_empty_table():
    schema = pa.schema([pa.field('oneField', pa.int64())])
    table = schema.empty_table()
    assert isinstance(table, pa.Table)
    assert table.num_rows == 0
    assert table.schema == schema
    ('string', ['a', 'b', 'c']),
    ('binary', [b'a', b'b', b'c']),
    (pa.binary(3), [b'abc', b'bcd', b'cde'])])
def test_cast_identities(ty, values):
    arr = pa.array(values, type=ty)
    assert arr.cast(ty).equals(arr)


pickle_test_parametrize = pytest.mark.parametrize(
    ('data', 'typ'),
    [([True, False, True, True], pa.bool_()),
     ([1, 2, 4, 6], pa.int64()),
     ([1.0, 2.5, None], pa.float64()),
     (['a', None, 'b'], pa.string()),
     ([], None),
     ([[1, 2], [3]], pa.list_(pa.int64())),
     ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
     ([(1, 'a'), (2, 'c'), None],
      pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))])


@pickle_test_parametrize
def test_array_pickle(data, typ):
    # Allocate here so that we don't have any Arrow data allocated.
    # This is needed to ensure that allocator tests can be reliable.
    array = pa.array(data, type=typ)
    for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
        result = pickle.loads(pickle.dumps(array, proto))
        assert array.equals(result)


@h.given(
    past.arrays(past.all_types, size=st.integers(min_value=0, max_value=10)))
def test_fixed_size_bytes_does_not_accept_varying_lengths(self):
    values = [b'foo', None, b'ba', None, None, b'hey']
    df = pd.DataFrame({'strings': values})
    schema = pa.schema([pa.field('strings', pa.binary(3))])
    with self.assertRaises(pa.ArrowInvalid):
        pa.Table.from_pandas(df, schema=schema)
def __init__(
    self,
    schema: Optional[pa.Schema] = None,
    features: Optional[Features] = None,
    path: Optional[str] = None,
    stream: Optional[pa.NativeFile] = None,
    fingerprint: Optional[str] = None,
    writer_batch_size: Optional[int] = None,
    hash_salt: Optional[str] = None,
    check_duplicates: Optional[bool] = False,
    disable_nullable: bool = False,
    update_features: bool = False,
    with_metadata: bool = True,
    unit: str = "examples",
):
    if path is None and stream is None:
        raise ValueError("At least one of path and stream must be provided.")
    if features is not None:
        self._features = features
        self._schema = pa.schema(features.type)
    elif schema is not None:
        self._schema: pa.Schema = schema
        self._features = Features.from_arrow_schema(self._schema)
    else:
        self._features = None
        self._schema = None

    if hash_salt is not None:
        # Create KeyHasher instance using split name as hash salt
        self._hasher = KeyHasher(hash_salt)
    else:
        self._hasher = KeyHasher("")

    self._check_duplicates = check_duplicates

    if disable_nullable and self._schema is not None:
        self._schema = pa.schema(
            pa.field(field.name, field.type, nullable=False)
            for field in self._schema)

    self._path = path
    if stream is None:
        self.stream = pa.OSFile(self._path, "wb")
        self._closable_stream = True
    else:
        self.stream = stream
        self._closable_stream = False

    self.fingerprint = fingerprint
    self.disable_nullable = disable_nullable
    self.writer_batch_size = writer_batch_size or config.DEFAULT_MAX_BATCH_SIZE
    self.update_features = update_features
    self.with_metadata = with_metadata
    self.unit = unit

    self._num_examples = 0
    self._num_bytes = 0
    self.current_examples: List[Tuple[Dict[str, Any], str]] = []
    self.current_rows: List[pa.Table] = []
    self.pa_writer: Optional[pa.RecordBatchStreamWriter] = None
    self.hkey_record = []
def _to_arrow_schema(row_type):
    return pa.schema([pa.field(n, to_arrow_type(t), t._nullable)
                      for n, t in zip(row_type.field_names(),
                                      row_type.field_types())])
def test_recordbatch_from_arrays_validate_schema():
    # ARROW-6263
    arr = pa.array([])
    schema = pa.schema([pa.field('f0', pa.utf8())])
    with pytest.raises(ValueError):
        pa.record_batch([arr], schema=schema)
                pa.array([[1], None, [3, 4]], type=pa.list_(pa.int32()))
            ], ["f1", "f2"])
        }, {
            "list<utf8>": pa.array([u"abc", None], type=pa.utf8())
        }],
        expected_output={
            "list<utf8>": pa.array(
                [None, None, None, None, None, None, None, u"abc", None],
                type=pa.utf8()),
            "struct<int32, list<int32>>": pa.array(
                [None, None, None, None, (1, [1]), (2, None), (None, [3, 4]),
                 None, None],
                type=pa.struct([
                    pa.field("f1", pa.int32()),
                    pa.field("f2", pa.list_(pa.int32()))
                ])),
        }),
]

_MERGE_INVALID_INPUT_TEST_CASES = [
    dict(
        testcase_name="not_a_list_of_tables",
        inputs=[pa.Table.from_arrays([pa.array([1])], ["f1"]), 1],
        expected_error_regexp="incompatible function arguments",
    ),
    dict(
        testcase_name="not_a_list",
        inputs=1,
        expected_error_regexp="incompatible function arguments",
def test_arrow_schema_convertion(self):
    arrow_schema = pa.schema([
        pa.field('string', pa.string()),
        pa.field('int8', pa.int8()),
        pa.field('int16', pa.int16()),
        pa.field('int32', pa.int32()),
        pa.field('int64', pa.int64()),
        pa.field('float', pa.float32()),
        pa.field('double', pa.float64()),
        pa.field('bool', pa.bool_(), False),
        pa.field('fixed_size_binary', pa.binary(10)),
        pa.field('variable_size_binary', pa.binary()),
        pa.field('decimal', pa.decimal128(3, 4)),
        pa.field('timestamp_s', pa.timestamp('s')),
        pa.field('timestamp_ns', pa.timestamp('ns')),
        pa.field('date_32', pa.date32()),
        pa.field('date_64', pa.date64()),
        pa.field('timestamp_ns', pa.timestamp('ns')),
    ])
    mock_dataset = _mock_parquet_dataset([], arrow_schema)

    unischema = Unischema.from_arrow_schema(mock_dataset)

    for name in arrow_schema.names:
        assert getattr(unischema, name).name == name
        assert isinstance(getattr(unischema, name).codec, ScalarCodec)
        if name == 'bool':
            assert not getattr(unischema, name).nullable
        else:
            assert getattr(unischema, name).nullable
def get_many_types():
    # Returning these from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes.
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.large_string(),
        pa.large_binary(),
        pa.list_(pa.int32()),
        pa.large_list(pa.uint16()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )
def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
                        safe=True):
    (all_names,
     column_names,
     index_column_names,
     index_descriptors,
     index_columns,
     columns_to_convert,
     convert_fields) = _get_columns_to_convert(df, schema, preserve_index,
                                               columns)

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, field):
        if field is None:
            field_nullable = True
            type_ = None
        else:
            field_nullable = field.nullable
            type_ = field.type

        try:
            result = pa.array(col, type=type_, from_pandas=True, safe=safe)
        except (pa.ArrowInvalid,
                pa.ArrowNotImplementedError,
                pa.ArrowTypeError) as e:
            e.args += ("Conversion failed for column {0!s} with type {1!s}"
                       .format(col.name, col.dtype),)
            raise e
        if not field_nullable and result.null_count > 0:
            raise ValueError("Field {} was non-nullable but pandas column "
                             "had {} null values".format(str(field),
                                                         result.null_count))
        return result

    if nthreads == 1:
        arrays = [convert_column(c, f)
                  for c, f in zip(columns_to_convert, convert_fields)]
    else:
        from concurrent import futures
        with futures.ThreadPoolExecutor(nthreads) as executor:
            arrays = list(executor.map(convert_column,
                                       columns_to_convert,
                                       convert_fields))

    types = [x.type for x in arrays]

    if schema is not None:
        # add index columns
        index_types = types[len(column_names):]
        for name, type_ in zip(index_column_names, index_types):
            name = name if name is not None else 'None'
            schema = schema.append(pa.field(name, type_))
    else:
        fields = []
        for name, type_ in zip(all_names, types):
            name = name if name is not None else 'None'
            fields.append(pa.field(name, type_))
        schema = pa.schema(fields)

    metadata = construct_metadata(df, column_names, index_columns,
                                  index_descriptors, preserve_index, types)
    schema = schema.with_metadata(metadata)

    return arrays, schema
def _to_arrow_type(field):
    if field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.TINYINT:
        return pa.field(field.name, pa.int8(), field.type.nullable)
    elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.SMALLINT:
        return pa.field(field.name, pa.int16(), field.type.nullable)
    elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.INT:
        return pa.field(field.name, pa.int32(), field.type.nullable)
    elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.BIGINT:
        return pa.field(field.name, pa.int64(), field.type.nullable)
    elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.BOOLEAN:
        return pa.field(field.name, pa.bool_(), field.type.nullable)
    elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.FLOAT:
        return pa.field(field.name, pa.float32(), field.type.nullable)
    elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.DOUBLE:
        return pa.field(field.name, pa.float64(), field.type.nullable)
    elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.VARCHAR:
        return pa.field(field.name, pa.utf8(), field.type.nullable)
    elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.VARBINARY:
        return pa.field(field.name, pa.binary(), field.type.nullable)
    elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.DECIMAL:
        return pa.field(field.name,
                        pa.decimal128(field.type.decimal_info.precision,
                                      field.type.decimal_info.scale),
                        field.type.nullable)
    elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.DATE:
        return pa.field(field.name, pa.date32(), field.type.nullable)
    elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.TIME:
        if field.type.time_info.precision == 0:
            return pa.field(field.name, pa.time32('s'), field.type.nullable)
        elif 1 <= field.type.time_type.precision <= 3:
            return pa.field(field.name, pa.time32('ms'), field.type.nullable)
        elif 4 <= field.type.time_type.precision <= 6:
            return pa.field(field.name, pa.time64('us'), field.type.nullable)
        else:
            return pa.field(field.name, pa.time64('ns'), field.type.nullable)
    else:
        raise ValueError("field_type %s is not supported." % field.type)
    assert not pa.types.is_float_value(1)
    assert pa.types.is_float_value(1.)
    assert pa.types.is_float_value(np.float64(1))
    assert not pa.types.is_float_value('1.0')


def test_is_boolean_value():
    assert not pa.types.is_boolean_value(1)
    assert pa.types.is_boolean_value(True)
    assert pa.types.is_boolean_value(False)
    assert pa.types.is_boolean_value(np.bool_(True))
    assert pa.types.is_boolean_value(np.bool_(False))


@h.given(past.all_types | past.all_fields | past.all_schemas)
@h.example(pa.field(name='', type=pa.null(), metadata={'0': '', '': ''}))
def test_pickling(field):
    data = pickle.dumps(field)
    assert pickle.loads(data) == field


@h.given(
    st.lists(past.all_types) |
    st.lists(past.all_fields) |
    st.lists(past.all_schemas))
def test_hashing(items):
    h.assume(
        # well, this is still O(n^2), but makes the input unique
        all(not a.equals(b) for i, a in enumerate(items) for b in items[:i]))

    container = {}
    for i, item in enumerate(items):
def clone_field(table, name, datatype):
    f = table.schema.field_by_name(name)
    return pa.field(f.name, datatype, f.nullable, f.metadata)
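# Minimal usage sketch for clone_field (table and column names are
# illustrative assumptions): rebuild a field with a new type while keeping
# the original name, nullability and metadata, e.g. before casting a table.
import pyarrow as pa

table = pa.Table.from_arrays([pa.array([1, 2, 3])], names=['price'])
price_as_float = clone_field(table, 'price', pa.float64())
assert price_as_float.name == 'price'
assert price_as_float.type == pa.float64()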
def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type):
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name", bq_type, mode="REPEATED",
                               fields=fields)

    actual = module_under_test.bq_to_arrow_data_type(field)

    expected_value_type = pyarrow.struct((
        pyarrow.field("field01", pyarrow.string()),
        pyarrow.field("field02", pyarrow.binary()),
        pyarrow.field("field03", pyarrow.int64()),
        pyarrow.field("field04", pyarrow.int64()),
        pyarrow.field("field05", pyarrow.float64()),
        pyarrow.field("field06", pyarrow.float64()),
        pyarrow.field("field07", module_under_test.pyarrow_numeric()),
        pyarrow.field("field08", pyarrow.bool_()),
        pyarrow.field("field09", pyarrow.bool_()),
        pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
        pyarrow.field("field11", pyarrow.date32()),
        pyarrow.field("field12", module_under_test.pyarrow_time()),
        pyarrow.field("field13", module_under_test.pyarrow_datetime()),
        pyarrow.field("field14", pyarrow.string()),
    ))
    assert pyarrow.types.is_list(actual)
    assert pyarrow.types.is_struct(actual.value_type)
    assert actual.value_type.num_children == len(fields)
    assert actual.value_type.equals(expected_value_type)