def test_type_schema_pickling():
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped
def test_fixed_size_bytes(self):
    data = [b'foof', None, b'barb', b'2346']
    arr = pa.from_pylist(data, type=pa.binary(4))
    assert len(arr) == 4
    assert arr.null_count == 1
    assert arr.type == pa.binary(4)
    assert arr.to_pylist() == data
def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]
    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: Ensure that supported type conversions don't
            # segfault on empty arrays of common types
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
def test_sequence_fixed_size_bytes():
    data = [b'foof', None, bytearray(b'barb'), b'2346']
    arr = pa.array(data, type=pa.binary(4))
    assert len(arr) == 4
    assert arr.null_count == 1
    assert arr.type == pa.binary(4)
    assert arr.to_pylist() == [b'foof', None, b'barb', b'2346']
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
    ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
    ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)
def field(jvm_field):
    """
    Construct a Field from an org.apache.arrow.vector.types.pojo.Field
    instance.

    Parameters
    ----------
    jvm_field: org.apache.arrow.vector.types.pojo.Field

    Returns
    -------
    pyarrow.Field
    """
    name = jvm_field.getName()
    jvm_type = jvm_field.getType()

    typ = None
    if not jvm_type.isComplex():
        type_str = jvm_type.getTypeID().toString()
        if type_str == 'Null':
            typ = pa.null()
        elif type_str == 'Int':
            typ = _from_jvm_int_type(jvm_type)
        elif type_str == 'FloatingPoint':
            typ = _from_jvm_float_type(jvm_type)
        elif type_str == 'Utf8':
            typ = pa.string()
        elif type_str == 'Binary':
            typ = pa.binary()
        elif type_str == 'FixedSizeBinary':
            typ = pa.binary(jvm_type.getByteWidth())
        elif type_str == 'Bool':
            typ = pa.bool_()
        elif type_str == 'Time':
            typ = _from_jvm_time_type(jvm_type)
        elif type_str == 'Timestamp':
            typ = _from_jvm_timestamp_type(jvm_type)
        elif type_str == 'Date':
            typ = _from_jvm_date_type(jvm_type)
        elif type_str == 'Decimal':
            typ = pa.decimal128(jvm_type.getPrecision(), jvm_type.getScale())
        else:
            raise NotImplementedError(
                "Unsupported JVM type: {}".format(type_str))
    else:
        # TODO: The following JVM types are not implemented:
        #       Struct, List, FixedSizeList, Union, Dictionary
        raise NotImplementedError(
            "JVM field conversion only implemented for primitive types.")

    nullable = jvm_field.isNullable()
    if jvm_field.getMetadata().isEmpty():
        metadata = None
    else:
        metadata = dict(jvm_field.getMetadata())
    return pa.field(name, typ, nullable, metadata)
def test_is_binary_string():
    assert types.is_binary(pa.binary())
    assert not types.is_binary(pa.string())

    assert types.is_string(pa.string())
    assert types.is_unicode(pa.string())
    assert not types.is_string(pa.binary())

    assert types.is_fixed_size_binary(pa.binary(5))
    assert not types.is_fixed_size_binary(pa.binary())
def test_bit_width():
    for ty, expected in [(pa.bool_(), 1),
                         (pa.int8(), 8),
                         (pa.uint32(), 32),
                         (pa.float16(), 16),
                         (pa.decimal128(19, 4), 128),
                         (pa.binary(42), 42 * 8)]:
        assert ty.bit_width == expected

    for ty in [pa.binary(), pa.string(), pa.list_(pa.int16())]:
        with pytest.raises(ValueError, match="fixed width"):
            ty.bit_width
def test_convert_options():
    cls = ConvertOptions
    opts = cls()

    assert opts.check_utf8 is True
    opts.check_utf8 = False
    assert opts.check_utf8 is False

    assert opts.strings_can_be_null is False
    opts.strings_can_be_null = True
    assert opts.strings_can_be_null is True

    assert opts.column_types == {}
    # Pass column_types as mapping
    opts.column_types = {'b': pa.int16(), 'c': pa.float32()}
    assert opts.column_types == {'b': pa.int16(), 'c': pa.float32()}
    opts.column_types = {'v': 'int16', 'w': 'null'}
    assert opts.column_types == {'v': pa.int16(), 'w': pa.null()}
    # Pass column_types as schema
    schema = pa.schema([('a', pa.int32()), ('b', pa.string())])
    opts.column_types = schema
    assert opts.column_types == {'a': pa.int32(), 'b': pa.string()}
    # Pass column_types as sequence
    opts.column_types = [('x', pa.binary())]
    assert opts.column_types == {'x': pa.binary()}

    with pytest.raises(TypeError, match='DataType expected'):
        opts.column_types = {'a': None}
    with pytest.raises(TypeError):
        opts.column_types = 0

    assert isinstance(opts.null_values, list)
    assert '' in opts.null_values
    assert 'N/A' in opts.null_values
    opts.null_values = ['xxx', 'yyy']
    assert opts.null_values == ['xxx', 'yyy']

    assert isinstance(opts.true_values, list)
    opts.true_values = ['xxx', 'yyy']
    assert opts.true_values == ['xxx', 'yyy']

    assert isinstance(opts.false_values, list)
    opts.false_values = ['xxx', 'yyy']
    assert opts.false_values == ['xxx', 'yyy']

    opts = cls(check_utf8=False, column_types={'a': pa.null()},
               null_values=['N', 'nn'], true_values=['T', 'tt'],
               false_values=['F', 'ff'], strings_can_be_null=True)
    assert opts.check_utf8 is False
    assert opts.column_types == {'a': pa.null()}
    assert opts.null_values == ['N', 'nn']
    assert opts.false_values == ['F', 'ff']
    assert opts.true_values == ['T', 'tt']
    assert opts.strings_can_be_null is True
def test_sequence_bytes():
    u1 = b'ma\xc3\xb1ana'
    data = [b'foo',
            u1.decode('utf-8'),  # unicode gets encoded,
            bytearray(b'bar'),
            None]
    for ty in [None, pa.binary()]:
        arr = pa.array(data, type=ty)
        assert len(arr) == 4
        assert arr.null_count == 1
        assert arr.type == pa.binary()
        assert arr.to_pylist() == [b'foo', u1, b'bar', None]
def test_array_mixed_unicode_bytes():
    values = [u'qux', b'foo', bytearray(b'barz')]
    b_values = [b'qux', b'foo', b'barz']
    u_values = [u'qux', u'foo', u'barz']

    arr = pa.array(values)
    expected = pa.array(b_values, type=pa.binary())
    assert arr.type == pa.binary()
    assert arr.equals(expected)

    arr = pa.array(values, type=pa.string())
    expected = pa.array(u_values, type=pa.string())
    assert arr.type == pa.string()
    assert arr.equals(expected)
def numpy_array_from_arrow_array(arrow_array):
    arrow_type = arrow_array.type
    buffers = arrow_array.buffers()
    assert len(buffers) == 2
    bitmap_buffer, data_buffer = buffers
    # todo: is there a better way to typecheck?
    if isinstance(arrow_type, type(pyarrow.binary(1))):
        # mimics python/pyarrow/array.pxi::Array::to_numpy
        assert len(buffers) == 2
        dtype = "S" + str(arrow_type.byte_width)
        # arrow seems to do padding, check if it is all ok
        expected_length = arrow_type.byte_width * len(arrow_array)
        actual_length = len(buffers[-1])
        if actual_length < expected_length:
            raise ValueError('buffer is smaller (%d) than expected (%d)' %
                             (actual_length, expected_length))
        array = np.frombuffer(buffers[-1], dtype, len(arrow_array))
        # TODO: deal with offset?
        # [arrow_array.offset:arrow_array.offset + len(arrow_array)]
    else:
        dtype = arrow_array.type.to_pandas_dtype()
        if np.bool_ == dtype:
            # TODO: this will also be a copy, we probably want to support
            # bitmasks as well
            bitmap = np.frombuffer(data_buffer, np.uint8, len(data_buffer))
            array = numpy_mask_from_arrow_mask(bitmap, len(arrow_array))
        else:
            array = np.frombuffer(data_buffer, dtype, len(arrow_array))

    if bitmap_buffer is not None:
        bitmap = np.frombuffer(bitmap_buffer, np.uint8, len(bitmap_buffer))
        mask = numpy_mask_from_arrow_mask(bitmap, len(arrow_array))
        array = np.ma.MaskedArray(array, mask=mask)
    return array
def test_orcfile_empty():
    from pyarrow import orc
    f = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
    table = f.read()
    assert table.num_rows == 0
    schema = table.schema
    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([
            ('list', pa.list_(pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
            ]))),
        ])),
        ('list', pa.list_(pa.struct([
            ('int1', pa.int32()),
            ('string1', pa.string()),
        ]))),
        ('map', pa.list_(pa.struct([
            ('key', pa.string()),
            ('value', pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
            ])),
        ]))),
    ])
    assert schema == expected_schema
def test_fixed_size_binary():
    t0 = pa.binary(10)
    data = [b'fooooooooo', None, b'barooooooo', b'quxooooooo']
    a0 = pa.array(data, type=t0)

    table = pa.Table.from_arrays([a0], ['binary[10]'])
    _check_roundtrip(table)
def test_bytes(self):
    u1 = b"ma\xc3\xb1ana"
    data = [b"foo", u1.decode("utf-8"), None]  # unicode gets encoded
    arr = pyarrow.from_pylist(data)
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pyarrow.binary()
    assert arr.to_pylist() == [b"foo", u1, None]
def test_fixed_size_bytes(self):
    values = [b'foo', None, b'bar', None, None, b'hey']
    df = pd.DataFrame({'strings': values})
    schema = pa.schema([pa.field('strings', pa.binary(3))])
    table = pa.Table.from_pandas(df, schema=schema)
    assert table.schema[0].type == schema[0].type
    assert table.schema[0].name == schema[0].name
    result = table.to_pandas()
    tm.assert_frame_equal(result, df)
def test_cast_binary_to_utf8():
    binary_arr = pa.array([b'foo', b'bar', b'baz'], type=pa.binary())
    utf8_arr = binary_arr.cast(pa.utf8())
    expected = pa.array(['foo', 'bar', 'baz'], type=pa.utf8())
    assert utf8_arr.equals(expected)

    non_utf8_values = [(u'mañana').encode('utf-16-le')]
    non_utf8_binary = pa.array(non_utf8_values)
    assert non_utf8_binary.type == pa.binary()
    with pytest.raises(ValueError):
        non_utf8_binary.cast(pa.string())

    non_utf8_all_null = pa.array(non_utf8_values, mask=np.array([True]),
                                 type=pa.binary())
    # No error
    casted = non_utf8_all_null.cast(pa.string())
    assert casted.null_count == 1
def test_bytes(self):
    u1 = b'ma\xc3\xb1ana'
    data = [b'foo',
            u1.decode('utf-8'),  # unicode gets encoded,
            None]
    arr = pyarrow.from_pylist(data)
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pyarrow.binary()
    assert arr.to_pylist() == [b'foo', u1, None]
def test_bytes_to_binary(self):
    values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan]
    df = pd.DataFrame({'strings': values})

    table = pa.Table.from_pandas(df)
    assert table[0].type == pa.binary()

    values2 = [b'qux', b'foo', None, b'bar', b'qux', np.nan]
    expected = pd.DataFrame({'strings': values2})
    self._check_pandas_roundtrip(df, expected)
def dataframe_with_lists(include_index=False):
    """
    Dataframe with list columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [],
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2, dtype=np.int64)[::2]
    ]
    fields.append(pa.field('double', pa.list_(pa.float64())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [],
        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
    ]
    fields.append(pa.field('bytes_list', pa.list_(pa.binary())))
    arrays['bytes_list'] = [
        [b"1", b"f"],
        None,
        [b"1"],
        [b"1", b"2", b"3"],
        [],
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        [u"1", u"ä"],
        None,
        [u"1"],
        [u"1", u"2", u"3"],
        [],
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))
    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)
    return df, schema
def test_fixed_size_bytes(self):
    data = [b'foof', None, b'barb']
    arr = pa.array(data, type=pa.binary(4))

    v = arr[0]
    assert isinstance(v, pa.FixedSizeBinaryValue)
    assert v.as_py() == b'foof'

    assert arr[1] is pa.NA

    v = arr[2].as_py()
    assert v == b'barb'
    assert isinstance(v, bytes)
def test_types_hashable():
    types = [
        pa.null(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())])
    ]
    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i
@contextlib.contextmanager  # assumed: the bare yield/finally below only makes
                            # sense with a generator-based context manager
def allocate_bytes(pool, nbytes):
    """
    Temporarily allocate *nbytes* from the given *pool*.
    """
    arr = pa.array([b"x" * nbytes], type=pa.binary(), memory_pool=pool)
    # Fetch the values buffer from the varbinary array and release the rest,
    # to get the desired allocation amount
    buf = arr.buffers()[2]
    arr = None
    assert len(buf) == nbytes
    try:
        yield
    finally:
        buf = None
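# A hypothetical usage sketch for allocate_bytes above (not from the original
# source). pa.proxy_memory_pool and MemoryPool.bytes_allocated are real
# pyarrow APIs; the allocator may pad allocations, hence the >= check.
pool = pa.proxy_memory_pool(pa.default_memory_pool())
with allocate_bytes(pool, 512):
    # The buffer is held alive for the duration of the block
    assert pool.bytes_allocated() >= 512
# Releasing the buffer returns the memory to the pool
assert pool.bytes_allocated() == 0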
def get_many_types():
    # Returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array, and
    # test_array.py::test_total_bytes_allocated checks that the default
    # memory pool has zero allocated bytes
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )
def arrow_array_from_numpy_array(array):
    dtype = array.dtype
    mask = None
    if np.ma.isMaskedArray(array):
        mask = array.mask
    if dtype.kind == 'S':
        type = pyarrow.binary(dtype.itemsize)
        arrow_array = pyarrow.array(array, type, mask=mask)
    else:
        if dtype.isnative:
            arrow_array = pyarrow.array(array, mask=mask)
        else:
            # TODO: we copy here, but I guess we should not... or give some
            # warning
            arrow_array = pyarrow.array(array.astype(dtype.newbyteorder('=')),
                                        mask=mask)
    return arrow_array
def test_union_type():
    def check_fields(ty, fields):
        assert ty.num_children == len(fields)
        assert [ty[i] for i in range(ty.num_children)] == fields

    fields = [pa.field('x', pa.list_(pa.int32())),
              pa.field('y', pa.binary())]

    for mode in ('sparse', pa.lib.UnionMode_SPARSE):
        ty = pa.union(fields, mode=mode)
        assert ty.mode == 'sparse'
        check_fields(ty, fields)

    for mode in ('dense', pa.lib.UnionMode_DENSE):
        ty = pa.union(fields, mode=mode)
        assert ty.mode == 'dense'
        check_fields(ty, fields)

    for mode in ('unknown', 2):
        with pytest.raises(ValueError, match='Invalid union mode'):
            pa.union(fields, mode=mode)
def test_simple_nulls(self):
    # Infer various kinds of data, with nulls
    rows = (b"a,b,c,d,e\n"
            b"1,2,,,3\n"
            b"nan,-5,foo,,nan\n"
            b"4.5,#N/A,nan,,\xff\n")
    table = self.read_bytes(rows)
    schema = pa.schema([('a', pa.float64()),
                        ('b', pa.int64()),
                        ('c', pa.string()),
                        ('d', pa.null()),
                        ('e', pa.binary())])
    assert table.schema == schema
    assert table.to_pydict() == {
        'a': [1.0, None, 4.5],
        'b': [2, -5, None],
        'c': [u"", u"foo", u"nan"],
        'd': [None, None, None],
        'e': [b"3", b"nan", b"\xff"],
    }
def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type):
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name", bq_type, mode="NULLABLE",
                               fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected = pyarrow.struct(
        (
            pyarrow.field("field01", pyarrow.string()),
            pyarrow.field("field02", pyarrow.binary()),
            pyarrow.field("field03", pyarrow.int64()),
            pyarrow.field("field04", pyarrow.int64()),
            pyarrow.field("field05", pyarrow.float64()),
            pyarrow.field("field06", pyarrow.float64()),
            pyarrow.field("field07", module_under_test.pyarrow_numeric()),
            pyarrow.field("field08", pyarrow.bool_()),
            pyarrow.field("field09", pyarrow.bool_()),
            pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
            pyarrow.field("field11", pyarrow.date32()),
            pyarrow.field("field12", module_under_test.pyarrow_time()),
            pyarrow.field("field13", module_under_test.pyarrow_datetime()),
            pyarrow.field("field14", pyarrow.string()),
        )
    )
    assert pyarrow.types.is_struct(actual)
    assert actual.num_children == len(fields)
    assert actual.equals(expected)
def test_type_for_alias():
    cases = [
        ('i1', pa.int8()),
        ('int8', pa.int8()),
        ('i2', pa.int16()),
        ('int16', pa.int16()),
        ('i4', pa.int32()),
        ('int32', pa.int32()),
        ('i8', pa.int64()),
        ('int64', pa.int64()),
        ('u1', pa.uint8()),
        ('uint8', pa.uint8()),
        ('u2', pa.uint16()),
        ('uint16', pa.uint16()),
        ('u4', pa.uint32()),
        ('uint32', pa.uint32()),
        ('u8', pa.uint64()),
        ('uint64', pa.uint64()),
        ('f4', pa.float32()),
        ('float32', pa.float32()),
        ('f8', pa.float64()),
        ('float64', pa.float64()),
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
    ]
    for val, expected in cases:
        assert pa.type_for_alias(val) == expected
@pytest.mark.parametrize(
    ('data', 'type', 'physical_type', 'min_value', 'max_value', 'null_count',
     'num_values', 'distinct_count'),
    [
        ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, 0),
        ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, 0),
        ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, 0),
        ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, 0),
        ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, 0),
        ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, 0),
        ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, 0),
        ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, 0),
        ([-1.1, 2.2, 2.3, None, 4.4], pa.float32(), 'FLOAT',
         -1.1, 4.4, 1, 4, 0),
        ([-1.1, 2.2, 2.3, None, 4.4], pa.float64(), 'DOUBLE',
         -1.1, 4.4, 1, 4, 0),
        (['', 'b', chr(1000), None, 'aaa'], pa.binary(), 'BYTE_ARRAY',
         b'', chr(1000).encode('utf-8'), 1, 4, 0),
        ([True, False, False, True, True], pa.bool_(), 'BOOLEAN',
         False, True, 0, 5, 0),
        ([b'\x00', b'b', b'12', None, b'aaa'], pa.binary(), 'BYTE_ARRAY',
         b'\x00', b'b', 1, 4, 0),
    ]
)
def test_parquet_column_statistics_api(data, type, physical_type, min_value,
                                       max_value, null_count, num_values,
                                       distinct_count):
    df = pd.DataFrame({'data': data})
    schema = pa.schema([pa.field('data', type)])
    table = pa.Table.from_pandas(df, schema=schema, safe=False)
    fileh = make_sample_file(table)
    meta = fileh.metadata
def test_stats_pipeline_with_examples_with_no_values(self):
    record_batches = [
        pa.RecordBatch.from_arrays([
            pa.array([[]], type=pa.list_(pa.float32())),
            pa.array([[]], type=pa.list_(pa.binary())),
            pa.array([[]], type=pa.list_(pa.int32())),
            pa.array([[2]]),
        ], ['a', 'b', 'c', 'w']),
        pa.RecordBatch.from_arrays([
            pa.array([[]], type=pa.list_(pa.float32())),
            pa.array([[]], type=pa.list_(pa.binary())),
            pa.array([[]], type=pa.list_(pa.int32())),
            pa.array([[2]]),
        ], ['a', 'b', 'c', 'w']),
        pa.RecordBatch.from_arrays([
            pa.array([[]], type=pa.list_(pa.float32())),
            pa.array([[]], type=pa.list_(pa.binary())),
            pa.array([[]], type=pa.list_(pa.int32())),
            pa.array([[2]]),
        ], ['a', 'b', 'c', 'w'])
    ]

    expected_result = text_format.Parse(
        """
        datasets {
          num_examples: 3
          features {
            path { step: 'a' }
            type: FLOAT
            num_stats {
              common_stats {
                num_non_missing: 3
                num_values_histogram {
                  buckets { sample_count: 1.5 }
                  buckets { sample_count: 1.5 }
                  type: QUANTILES
                }
                weighted_common_stats { num_non_missing: 6 }
              }
            }
          }
          features {
            path { step: 'b' }
            type: STRING
            string_stats {
              common_stats {
                num_non_missing: 3
                num_values_histogram {
                  buckets { sample_count: 1.5 }
                  buckets { sample_count: 1.5 }
                  type: QUANTILES
                }
                weighted_common_stats { num_non_missing: 6 }
              }
            }
          }
          features {
            path { step: 'c' }
            type: INT
            num_stats {
              common_stats {
                num_non_missing: 3
                num_values_histogram {
                  buckets { sample_count: 1.5 }
                  buckets { sample_count: 1.5 }
                  type: QUANTILES
                }
                weighted_common_stats { num_non_missing: 6 }
              }
            }
          }
          features {
            path { step: 'w' }
            type: INT
            num_stats {
              common_stats {
                num_non_missing: 3
                num_missing: 0
                min_num_values: 1
                max_num_values: 1
                avg_num_values: 1.0
                tot_num_values: 3
                num_values_histogram {
                  buckets { low_value: 1.0 high_value: 1.0 sample_count: 1.5 }
                  buckets { low_value: 1.0 high_value: 1.0 sample_count: 1.5 }
                  type: QUANTILES
                }
                weighted_common_stats {
                  num_non_missing: 6.0
                  avg_num_values: 1.0
                  tot_num_values: 6.0
                }
              }
              mean: 2.0
              std_dev: 0.0
              min: 2.0
              max: 2.0
              median: 2.0
              histograms {
                buckets { low_value: 2.0 high_value: 2.0 sample_count: 3.0 }
                type: STANDARD
              }
              histograms {
                buckets { low_value: 2.0 high_value: 2.0 sample_count: 3.0 }
                type: QUANTILES
              }
              weighted_numeric_stats {
                mean: 2.0
                median: 2.0
                histograms {
                  buckets { low_value: 2.0 high_value: 2.0 sample_count: 6.0 }
                  type: STANDARD
                }
                histograms {
                  buckets { low_value: 2.0 high_value: 2.0 sample_count: 6.0 }
                  type: QUANTILES
                }
              }
            }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())

    with beam.Pipeline() as p:
        options = stats_options.StatsOptions(
            weight_feature='w',
            num_top_values=1,
            num_rank_histogram_buckets=1,
            num_values_histogram_buckets=2,
            num_histogram_buckets=1,
            num_quantiles_histogram_buckets=1,
            epsilon=0.001)
        result = (
            p | beam.Create(record_batches)
            | stats_api.GenerateStatistics(options))
        util.assert_that(
            result,
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, expected_result))
def test_sql(parameters, db_type):
    df = get_df()
    if db_type == "redshift":
        df.drop(["binary"], axis=1, inplace=True)
    engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}")
    wr.db.to_sql(
        df=df,
        con=engine,
        name="test_sql",
        schema=parameters[db_type]["schema"],
        if_exists="replace",
        index=False,
        index_label=None,
        chunksize=None,
        method=None,
        dtype={"iint32": sqlalchemy.types.Integer},
    )
    df = wr.db.read_sql_query(
        sql=f"SELECT * FROM {parameters[db_type]['schema']}.test_sql",
        con=engine)
    ensure_data_types(df, has_list=False)
    engine = wr.db.get_engine(
        db_type=db_type,
        host=parameters[db_type]["host"],
        port=parameters[db_type]["port"],
        database=parameters[db_type]["database"],
        user=parameters["user"],
        password=parameters["password"],
    )
    dfs = wr.db.read_sql_query(
        sql=f"SELECT * FROM {parameters[db_type]['schema']}.test_sql",
        con=engine,
        chunksize=1,
        dtype={
            "iint8": pa.int8(),
            "iint16": pa.int16(),
            "iint32": pa.int32(),
            "iint64": pa.int64(),
            "float": pa.float32(),
            "double": pa.float64(),
            "decimal": pa.decimal128(3, 2),
            "string_object": pa.string(),
            "string": pa.string(),
            "date": pa.date32(),
            "timestamp": pa.timestamp(unit="ns"),
            "binary": pa.binary(),
            "category": pa.float64(),
        },
    )
    for df in dfs:
        ensure_data_types(df, has_list=False)
    if db_type != "redshift":
        account_id = boto3.client("sts").get_caller_identity().get("Account")
        engine = wr.catalog.get_engine(
            connection=f"aws-data-wrangler-{db_type}", catalog_id=account_id)
        wr.db.to_sql(
            df=pd.DataFrame({"col0": [1, 2, 3]}, dtype="Int32"),
            con=engine,
            name="test_sql",
            schema=parameters[db_type]["schema"],
            if_exists="replace",
            index=True,
            index_label="index",
        )
        schema = None
        if db_type == "postgresql":
            schema = parameters[db_type]["schema"]
        df = wr.db.read_sql_table(con=engine, table="test_sql",
                                  schema=schema, index_col="index")
        assert len(df.index) == 3
        assert len(df.columns) == 1
MANY_TYPES = [
    pa.null(),
    pa.bool_(),
    pa.int32(),
    pa.time32('s'),
    pa.time64('us'),
    pa.date32(),
    pa.timestamp('us'),
    pa.timestamp('us', tz='UTC'),
    pa.timestamp('us', tz='Europe/Paris'),
    pa.float16(),
    pa.float32(),
    pa.float64(),
    pa.decimal128(19, 4),
    pa.string(),
    pa.binary(),
    pa.binary(10),
    pa.list_(pa.int32()),
    pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int8()),
        pa.field('c', pa.string())
    ]),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
    # XXX Needs array pickling
    # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])),
]
import hypothesis as h
import hypothesis.strategies as st
import hypothesis.extra.numpy as npst
import hypothesis.extra.pytz as tzst
import numpy as np

import pyarrow as pa

# TODO(kszucs): alphanum_text, surrogate_text
custom_text = st.text(
    alphabet=st.characters(min_codepoint=0x41, max_codepoint=0x7E))

null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())
large_binary_type = st.just(pa.large_binary())
large_string_type = st.just(pa.large_string())
fixed_size_binary_type = st.builds(
    pa.binary, st.integers(min_value=0, max_value=16))
binary_like_types = st.one_of(
    binary_type, string_type, large_binary_type, large_string_type,
    fixed_size_binary_type)

signed_integer_types = st.sampled_from(
    [pa.int8(), pa.int16(), pa.int32(), pa.int64()])
unsigned_integer_types = st.sampled_from(
    [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)
def popbuffers(tpe, buffers, length):
    if isinstance(tpe, pyarrow.lib.DictionaryType):
        index = popbuffers(tpe.index_type, buffers, length)
        content = fromarrow(tpe.dictionary)
        if isinstance(index, awkwardlib.BitMaskedArray):
            return awkwardlib.BitMaskedArray(
                index.mask,
                awkwardlib.IndexedArray(index.content, content),
                maskedwhen=index.maskedwhen,
                lsborder=index.lsborder)
        else:
            return awkwardlib.IndexedArray(index, content)

    elif isinstance(tpe, pyarrow.lib.StructType):
        mask = buffers.pop(0)
        pairs = []
        for i in range(tpe.num_children):
            pairs.append(
                (tpe[i].name, popbuffers(tpe[i].type, buffers, length)))
        out = awkwardlib.Table.frompairs(pairs, 0)  # FIXME: better rowstart
        if mask is not None:
            mask = awkwardlib.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE)
            return awkwardlib.BitMaskedArray(mask, out, maskedwhen=False,
                                             lsborder=True)
        else:
            return out

    elif isinstance(tpe, pyarrow.lib.ListType):
        mask = buffers.pop(0)
        offsets = awkwardlib.numpy.frombuffer(
            buffers.pop(0), dtype=ARROW_INDEXTYPE)[:length + 1]
        content = popbuffers(tpe.value_type, buffers, offsets[-1])
        out = awkwardlib.JaggedArray.fromoffsets(offsets, content)
        if mask is not None:
            mask = awkwardlib.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE)
            return awkwardlib.BitMaskedArray(mask, out, maskedwhen=False,
                                             lsborder=True)
        else:
            return out

    elif isinstance(tpe, pyarrow.lib.UnionType) and tpe.mode == "sparse":
        mask = buffers.pop(0)
        tags = awkwardlib.numpy.frombuffer(buffers.pop(0),
                                           dtype=ARROW_TAGTYPE)[:length]
        assert buffers.pop(0) is None
        index = awkwardlib.numpy.arange(len(tags), dtype=ARROW_INDEXTYPE)
        contents = []
        for i in range(tpe.num_children):
            try:
                sublength = index[tags == i][-1] + 1
            except IndexError:
                sublength = 0
            contents.append(popbuffers(tpe[i].type, buffers, sublength))
        for i in range(len(contents)):
            these = index[tags == i]
            if len(these) == 0:
                contents[i] = contents[i][0:0]
            else:
                contents[i] = contents[i][:these[-1] + 1]
        out = awkwardlib.UnionArray(tags, index, contents)
        if mask is not None:
            mask = awkwardlib.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE)
            return awkwardlib.BitMaskedArray(mask, out, maskedwhen=False,
                                             lsborder=True)
        else:
            return out

    elif isinstance(tpe, pyarrow.lib.UnionType) and tpe.mode == "dense":
        mask = buffers.pop(0)
        tags = awkwardlib.numpy.frombuffer(buffers.pop(0),
                                           dtype=ARROW_TAGTYPE)[:length]
        index = awkwardlib.numpy.frombuffer(buffers.pop(0),
                                            dtype=ARROW_INDEXTYPE)[:length]
        contents = []
        for i in range(tpe.num_children):
            try:
                sublength = index[tags == i].max() + 1
            except ValueError:
                sublength = 0
            contents.append(popbuffers(tpe[i].type, buffers, sublength))
        for i in range(len(contents)):
            these = index[tags == i]
            if len(these) == 0:
                contents[i] = contents[i][0:0]
            else:
                contents[i] = contents[i][:these.max() + 1]
        out = awkwardlib.UnionArray(tags, index, contents)
        if mask is not None:
            mask = awkwardlib.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE)
            return awkwardlib.BitMaskedArray(mask, out, maskedwhen=False,
                                             lsborder=True)
        else:
            return out

    elif tpe == pyarrow.string():
        mask = buffers.pop(0)
        offsets = awkwardlib.numpy.frombuffer(
            buffers.pop(0), dtype=ARROW_INDEXTYPE)[:length + 1]
        content = awkwardlib.numpy.frombuffer(
            buffers.pop(0), dtype=ARROW_CHARTYPE)[:offsets[-1]]
        out = awkwardlib.StringArray.fromoffsets(
            offsets, content[:offsets[-1]], encoding="utf-8")
        if mask is not None:
            mask = awkwardlib.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE)
            return awkwardlib.BitMaskedArray(mask, out, maskedwhen=False,
                                             lsborder=True)
        else:
            return out

    elif tpe == pyarrow.binary():
        mask = buffers.pop(0)
        offsets = awkwardlib.numpy.frombuffer(
            buffers.pop(0), dtype=ARROW_INDEXTYPE)[:length + 1]
        content = awkwardlib.numpy.frombuffer(
            buffers.pop(0), dtype=ARROW_CHARTYPE)[:offsets[-1]]
        out = awkwardlib.StringArray.fromoffsets(
            offsets, content[:offsets[-1]], encoding=None)
        if mask is not None:
            mask = awkwardlib.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE)
            return awkwardlib.BitMaskedArray(mask, out, maskedwhen=False,
                                             lsborder=True)
        else:
            return out

    elif tpe == pyarrow.bool_():
        mask = buffers.pop(0)
        out = awkwardlib.numpy.unpackbits(
            awkwardlib.numpy.frombuffer(buffers.pop(0), dtype=ARROW_CHARTYPE)
        ).view(awkwardlib.MaskedArray.BOOLTYPE)
        out = out.reshape(-1, 8)[:, ::-1].reshape(-1)[:length]  # lsborder=True
        if mask is not None:
            mask = awkwardlib.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE)
            return awkwardlib.BitMaskedArray(mask, out, maskedwhen=False,
                                             lsborder=True)
        else:
            return out

    elif isinstance(tpe, pyarrow.lib.DataType):
        mask = buffers.pop(0)
        out = awkwardlib.numpy.frombuffer(
            buffers.pop(0), dtype=tpe.to_pandas_dtype())[:length]
        if mask is not None:
            mask = awkwardlib.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE)
            return awkwardlib.BitMaskedArray(mask, out, maskedwhen=False,
                                             lsborder=True)
        else:
            return out

    else:
        raise NotImplementedError(repr(tpe))
    result = arr.cast('i8')
    assert result.equals(expected)


@pytest.mark.parametrize(('ty', 'values'), [
    ('bool', [True, False, True, True]),
    ('uint8', range(0, 255)),
    ('int8', range(0, 128)),
    ('uint16', range(0, 10)),
    ('int16', range(0, 10)),
    ('uint32', range(0, 10)),
    ('int32', range(0, 10)),
    ('uint64', range(0, 10)),
    ('int64', range(0, 10)),
    ('float', [0.0, 0.1, 0.2]),
    ('double', [0.0, 0.1, 0.2]),
    ('string', ['a', 'b', 'c']),
    ('binary', [b'a', b'b', b'c']),
    (pa.binary(3), [b'abc', b'bcd', b'cde'])
])
def test_cast_identities(ty, values):
    arr = pa.array(values, type=ty)
    assert arr.cast(ty).equals(arr)


pickle_test_parametrize = pytest.mark.parametrize(
    ('data', 'typ'),
    [([True, False, True, True], pa.bool_()),
     ([1, 2, 4, 6], pa.int64()),
     ([1.0, 2.5, None], pa.float64()),
     (['a', None, 'b'], pa.string()),
     ([], None),
     ([[1, 2], [3]], pa.list_(pa.int64())),
     ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
     ([(1, 'a'), (2, 'c'), None],
      pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))])
def check_example_batch(batch):
    arr = batch.column(0)
    assert isinstance(arr, pa.ExtensionArray)
    assert arr.type.storage_type == pa.binary(3)
    assert arr.storage.to_pylist() == [b"foo", b"bar"]
    return arr
def example_batch():
    ty = ParamExtType(3)
    storage = pa.array([b"foo", b"bar"], type=pa.binary(3))
    arr = pa.ExtensionArray.from_storage(ty, storage)
    return pa.RecordBatch.from_arrays([arr], ["exts"])
def test_fixed_size_bytes_does_not_accept_varying_lengths(self):
    values = [b'foo', None, b'ba', None, None, b'hey']
    df = pd.DataFrame({'strings': values})
    schema = pa.schema([pa.field('strings', pa.binary(3))])
    with pytest.raises(pa.ArrowInvalid):
        pa.Table.from_pandas(df, schema=schema)
def __init__(self, width):
    self._width = width
    pa.PyExtensionType.__init__(self, pa.binary(width))
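# Context (assumption): the __init__ above only makes sense inside a
# pa.PyExtensionType subclass. A minimal sketch of the enclosing class,
# consistent with how ParamExtType is used by the example_batch and
# check_example_batch snippets above; the width property and __reduce__
# (which PyExtensionType needs for pickling) are assumed:
class ParamExtType(pa.PyExtensionType):

    def __init__(self, width):
        self._width = width
        # Storage is fixed-size binary of the given width
        pa.PyExtensionType.__init__(self, pa.binary(width))

    @property
    def width(self):
        return self._width

    def __reduce__(self):
        # Recreate the parameterized type when unpickling
        return ParamExtType, (self.width,)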
STR_TYPE_ARROW_TYPE_MAP = {
    'int8': pa.int8(),
    'int16': pa.int16(),
    'int32': pa.int32(),
    'int64': pa.int64(),
    'uint8': pa.uint8(),
    'uint16': pa.uint16(),
    'uint32': pa.uint32(),
    'uint64': pa.uint64(),
    'float32': pa.float32(),
    'float64': pa.float64(),
    'double': pa.float64(),
    'half_float': pa.float16(),
    'string': pa.string(),
    'binary': pa.binary(),
    'bool': pa.bool_(),
    'float': pa.float32(),
    'int': pa.int32(),
    'str': pa.string()
}


def _get_arrow_type_from_python_type(python_type):
    try:
        return PYTHON_TYPE_ARROW_TYPE_MAP[python_type]
    except KeyError:
        return None


def _get_arrow_type_from_str_type(cylon_str_type):
def test_convert_options():
    cls = ConvertOptions
    opts = cls()

    check_options_class(cls, check_utf8=[True, False],
                        strings_can_be_null=[False, True],
                        include_columns=[[], ['def', 'abc']],
                        include_missing_columns=[False, True],
                        auto_dict_encode=[False, True],
                        timestamp_parsers=[[], [ISO8601, '%y-%m']])

    assert opts.auto_dict_max_cardinality > 0
    opts.auto_dict_max_cardinality = 99999
    assert opts.auto_dict_max_cardinality == 99999

    assert opts.column_types == {}
    # Pass column_types as mapping
    opts.column_types = {'b': pa.int16(), 'c': pa.float32()}
    assert opts.column_types == {'b': pa.int16(), 'c': pa.float32()}
    opts.column_types = {'v': 'int16', 'w': 'null'}
    assert opts.column_types == {'v': pa.int16(), 'w': pa.null()}
    # Pass column_types as schema
    schema = pa.schema([('a', pa.int32()), ('b', pa.string())])
    opts.column_types = schema
    assert opts.column_types == {'a': pa.int32(), 'b': pa.string()}
    # Pass column_types as sequence
    opts.column_types = [('x', pa.binary())]
    assert opts.column_types == {'x': pa.binary()}

    with pytest.raises(TypeError, match='DataType expected'):
        opts.column_types = {'a': None}
    with pytest.raises(TypeError):
        opts.column_types = 0

    assert isinstance(opts.null_values, list)
    assert '' in opts.null_values
    assert 'N/A' in opts.null_values
    opts.null_values = ['xxx', 'yyy']
    assert opts.null_values == ['xxx', 'yyy']

    assert isinstance(opts.true_values, list)
    opts.true_values = ['xxx', 'yyy']
    assert opts.true_values == ['xxx', 'yyy']

    assert isinstance(opts.false_values, list)
    opts.false_values = ['xxx', 'yyy']
    assert opts.false_values == ['xxx', 'yyy']

    assert opts.timestamp_parsers == []
    opts.timestamp_parsers = [ISO8601]
    assert opts.timestamp_parsers == [ISO8601]

    opts = cls(column_types={'a': pa.null()},
               null_values=['N', 'nn'], true_values=['T', 'tt'],
               false_values=['F', 'ff'], auto_dict_max_cardinality=999,
               timestamp_parsers=[ISO8601, '%Y-%m-%d'])
    assert opts.column_types == {'a': pa.null()}
    assert opts.null_values == ['N', 'nn']
    assert opts.false_values == ['F', 'ff']
    assert opts.true_values == ['T', 'tt']
    assert opts.auto_dict_max_cardinality == 999
    assert opts.timestamp_parsers == [ISO8601, '%Y-%m-%d']
all_array_types = [
    ('bool', [True, False, False, True, True]),
    ('uint8', np.arange(5)),
    ('int8', np.arange(5)),
    ('uint16', np.arange(5)),
    ('int16', np.arange(5)),
    ('uint32', np.arange(5)),
    ('int32', np.arange(5)),
    ('uint64', np.arange(5, 10)),
    ('int64', np.arange(5, 10)),
    ('float', np.arange(0, 0.5, 0.1)),
    ('double', np.arange(0, 0.5, 0.1)),
    ('string', ['a', 'b', None, 'ddd', 'ee']),
    ('binary', [b'a', b'b', b'c', b'ddd', b'ee']),
    (pa.binary(3), [b'abc', b'bcd', b'cde', b'def', b'efg']),
    (pa.list_(pa.int8()), [[1, 2], [3, 4], [5, 6], None, [9, 16]]),
    (pa.large_list(pa.int16()), [[1], [2, 3, 4], [5, 6], None, [9, 16]]),
    (pa.struct([('a', pa.int8()), ('b', pa.int8())]),
     [{'a': 1, 'b': 2}, None, {'a': 3, 'b': 4}, None, {'a': 5, 'b': 6}]),
]

exported_functions = [
""" A DoFn that coverts a batch of features into an Arrow table.""" import apache_beam as beam import pyarrow as pa from typing import Dict, List, Mapping, Union from tensorflow_metadata.proto.v0 import schema_pb2 from tensorflow_metadata.proto.v0 import statistics_pb2 _ARROW_TYPE_MAP = { ColumnType.UNKNOWN: pa.null(), ColumnType.INT: pa.list_(pa.int64()), ColumnType.FLOAT: pa.list_(pa.float32()), ColumnType.STRING: pa.list_(pa.binary()), } SimpleFeatureList = List[Union[int, str, float, bool]] ColumnName = Union[bytes, Text] @beam.typehints.with_input_types(List[SimpleFeatureList]) @beam.typehints.with_output_types(pa.RecordBatch) class BatchedFeatureListsToRecordBatch(beam.DoFn): """A DoFn to convert a batch of input instances in a feature list format to an Arrow table. """ def __init__( self,
def __init__(self):
    pa.PyExtensionType.__init__(self, pa.binary(16))
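# Context (assumption): as with ParamExtType above, this __init__ belongs to
# a pa.PyExtensionType subclass backed by fixed-size binary storage. A
# minimal sketch under that assumption; the class name UuidType and the
# __reduce__ body are illustrative:
class UuidType(pa.PyExtensionType):

    def __init__(self):
        # 16-byte fixed-size binary storage, the size of a UUID
        pa.PyExtensionType.__init__(self, pa.binary(16))

    def __reduce__(self):
        # Parameter-free type: reconstruct by calling the class
        return UuidType, ()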
def test_array_mixed_unicode_bytes():
    check_array_mixed_unicode_bytes(pa.binary(), pa.string())
    check_array_mixed_unicode_bytes(pa.large_binary(), pa.large_string())
def test_ext_array_errors():
    ty = ParamExtType(4)
    storage = pa.array([b"foo", b"bar"], type=pa.binary(3))
    with pytest.raises(TypeError, match="Incompatible storage type"):
        pa.ExtensionArray.from_storage(ty, storage)
def recurse(tpe, nullable):
    if isinstance(tpe, pyarrow.lib.DictionaryType):
        out = recurse(tpe.dictionary.type, nullable)
        if nullable:
            return awkward.type.OptionType(out)
        else:
            return out

    elif isinstance(tpe, pyarrow.lib.StructType):
        out = None
        for i in range(tpe.num_children):
            x = awkward.type.ArrayType(
                tpe[i].name, recurse(tpe[i].type, tpe[i].nullable))
            if out is None:
                out = x
            else:
                out = out & x
        if nullable:
            return awkward.type.OptionType(out)
        else:
            return out

    elif isinstance(tpe, pyarrow.lib.ListType):
        out = awkward.type.ArrayType(float("inf"),
                                     recurse(tpe.value_type, nullable))
        if nullable:
            return awkward.type.OptionType(out)
        else:
            return out

    elif isinstance(tpe, pyarrow.lib.UnionType):
        out = None
        for i in range(tpe.num_children):
            x = recurse(tpe[i].type, nullable)
            if out is None:
                out = x
            else:
                out = out | x
        if nullable:
            return awkward.type.OptionType(out)
        else:
            return out

    elif tpe == pyarrow.string():
        if nullable:
            return awkward.type.OptionType(str)
        else:
            return str

    elif tpe == pyarrow.binary():
        if nullable:
            return awkward.type.OptionType(bytes)
        else:
            return bytes

    elif tpe == pyarrow.bool_():
        out = awkward.numpy.dtype(bool)
        if nullable:
            return awkward.type.OptionType(out)
        else:
            return out

    elif isinstance(tpe, pyarrow.lib.DataType):
        if nullable:
            return awkward.type.OptionType(tpe.to_pandas_dtype())
        else:
            return tpe.to_pandas_dtype()

    else:
        raise NotImplementedError(repr(tpe))
def test_ext_array_to_pylist():
    ty = ParamExtType(3)
    storage = pa.array([b"foo", b"bar", None], type=pa.binary(3))
    arr = pa.ExtensionArray.from_storage(ty, storage)
    assert arr.to_pylist() == [b"foo", b"bar", None]
def test_fixed_size_binary_byte_width():
    ty = pa.binary(5)
    assert ty.byte_width == 5
def test_fixed_size_bytes_does_not_accept_varying_lengths():
    data = [b'foo', None, b'barb', b'2346']
    with pytest.raises(pa.ArrowInvalid):
        pa.array(data, type=pa.binary(4))
class KmvSketchTest(parameterized.TestCase):

    @parameterized.named_parameters(
        ("binary", [b"a", b"a", b"b", b"c", None], pa.binary()),
        ("large_binary", [b"a", b"a", b"b", b"c"], pa.large_binary()),
        ("string", ["a", "a", "b", "c", None], pa.string()),
        ("large_string", ["a", "a", "b", "c"], pa.large_string()),
        ("int8", [1, 1, 2, 3, None], pa.int8()),
        ("int16", [1, 1, 2, 3], pa.int16()),
        ("int32", [1, 1, 2, 3, None], pa.int32()),
        ("int64", [1, 1, 2, 3], pa.int64()),
        ("uint8", [1, 1, 2, 3], pa.uint8()),
        ("uint16", [1, None, 1, 2, 3], pa.uint16()),
        ("uint32", [1, 1, 2, 3], pa.uint32()),
        ("uint64", [1, 1, 2, 3, None], pa.uint64()),
    )
    def test_add(self, values, type_):
        sketch = _create_basic_sketch(pa.array(values, type=type_))
        num_unique = sketch.Estimate()
        self.assertEqual(3, num_unique)

    def test_add_unsupported_type(self):
        values = pa.array([True, False], pa.bool_())
        sketch = sketches.KmvSketch(_NUM_BUCKETS)
        with self.assertRaisesRegex(RuntimeError, "Unimplemented: bool"):
            sketch.AddValues(values)

    def test_merge(self):
        sketch1 = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
        sketch2 = _create_basic_sketch(pa.array(["d", "a"]))
        sketch1.Merge(sketch2)
        num_unique = sketch1.Estimate()
        self.assertEqual(4, num_unique)

    def test_merge_error(self):
        sketch1 = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
        sketch2 = _create_basic_sketch(pa.array(["d", "a"]), num_buckets=64)
        with self.assertRaisesRegex(
                Exception,
                "Both sketches must have the same number of buckets"):
            sketch1.Merge(sketch2)

    def test_picklable(self):
        sketch = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
        pickled = pickle.dumps(sketch, 2)
        self.assertIsInstance(pickled, bytes)
        unpickled = pickle.loads(pickled)
        self.assertIsInstance(unpickled, sketches.KmvSketch)
        num_unique = unpickled.Estimate()
        self.assertEqual(3, num_unique)

    def test_serialization(self):
        sketch = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
        serialized = sketch.Serialize()
        self.assertIsInstance(serialized, bytes)
        deserialized = sketches.KmvSketch.Deserialize(serialized)
        self.assertIsInstance(deserialized, sketches.KmvSketch)
        num_unique = deserialized.Estimate()
        self.assertEqual(3, num_unique)
], "type_schema": OrderedDict([ ("a", int), ("b", float), ("c", str), ("d", np.ndarray), ("e", bytes), ]), "pyarrow_schema": pa.schema([ ("a", pa.int64()), ("b", pa.float64()), ("c", pa.string()), ("d", pa.list_(pa.int64())), ("e", pa.binary()), ]) if pa is not None else None, "avro_schema": { "namespace": "example.avro", "name": "User", "type": "record", "fields": [ { "name": "a", "type": "int" }, { "name": "b",
def dataframe_with_lists(include_index=False, parquet_compatible=False):
    """
    Dataframe with list columns of every possible primitive type.

    Parameters
    ----------
    parquet_compatible: bool
        Exclude types not supported by parquet

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [],
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2, dtype=np.int64)[::2]
    ]
    fields.append(pa.field('double', pa.list_(pa.float64())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [],
        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
    ]
    fields.append(pa.field('bytes_list', pa.list_(pa.binary())))
    arrays['bytes_list'] = [
        [b"1", b"f"],
        None,
        [b"1"],
        [b"1", b"2", b"3"],
        [],
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        ["1", "ä"],
        None,
        ["1"],
        ["1", "2", "3"],
        [],
    ]

    date_data = [
        [],
        [date(2018, 1, 1), date(2032, 12, 30)],
        [date(2000, 6, 7)],
        None,
        [date(1969, 6, 9), date(1972, 7, 3)]
    ]
    time_data = [
        [time(23, 11, 11), time(1, 2, 3), time(23, 59, 59)],
        [],
        [time(22, 5, 59)],
        None,
        [time(0, 0, 0), time(18, 0, 2), time(12, 7, 3)]
    ]

    temporal_pairs = [
        (pa.date32(), date_data),
        (pa.date64(), date_data),
        (pa.time32('s'), time_data),
        (pa.time32('ms'), time_data),
        (pa.time64('us'), time_data)
    ]
    if not parquet_compatible:
        temporal_pairs += [
            (pa.time64('ns'), time_data),
        ]
    for value_type, data in temporal_pairs:
        field_name = '{}_list'.format(value_type)
        field_type = pa.list_(value_type)
        field = pa.field(field_name, field_type)
        fields.append(field)
        arrays[field_name] = data

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)
    return df, schema
"FLOAT64", pyarrow.float32().id: "FLOAT64", pyarrow.float64().id: "FLOAT64", pyarrow.time32("ms").id: "TIME", pyarrow.time64("ns").id: "TIME", pyarrow.timestamp("ns").id: "TIMESTAMP", pyarrow.date32().id: "DATE", pyarrow.date64().id: "DATETIME", # because millisecond resolution pyarrow.binary().id: "BYTES", pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() pyarrow.decimal128(38, scale=9).id: "NUMERIC", # The exact decimal's scale and precision are not important, as only # the type ID matters, and it's the same for all decimal128 instances. } else: # pragma: NO COVER BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER ARROW_SCALAR_IDS_TO_BQ = {} # pragma: NO_COVER def bq_to_arrow_struct_data_type(field):
import os
import sys

import cx_Oracle
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# cx_Oracle => PyArrow type map
type_map = {
    cx_Oracle.DB_TYPE_BFILE: pa.binary(),
    cx_Oracle.DB_TYPE_BINARY_DOUBLE: pa.float64(),
    cx_Oracle.DB_TYPE_BINARY_FLOAT: pa.float64(),
    cx_Oracle.DB_TYPE_BLOB: pa.binary(),
    cx_Oracle.DB_TYPE_CHAR: pa.string(),
    cx_Oracle.DB_TYPE_CLOB: pa.binary(),
    # cx_Oracle.DB_TYPE_CURSOR
    cx_Oracle.DB_TYPE_DATE: pa.timestamp('ms'),
    # cx_Oracle.DB_TYPE_INTERVAL_DS
    cx_Oracle.DB_TYPE_LONG: pa.string(),
    cx_Oracle.DB_TYPE_LONG_RAW: pa.binary(),
    cx_Oracle.DB_TYPE_NCHAR: pa.string(),
    cx_Oracle.DB_TYPE_NCLOB: pa.binary(),
    # cx_Oracle.DB_TYPE_NUMBER: pa.float64(),  # could reflect on
    #                                          # precision/scale
    cx_Oracle.DB_TYPE_NVARCHAR: pa.string(),
    # cx_Oracle.DB_TYPE_OBJECT
    cx_Oracle.DB_TYPE_RAW: pa.binary(),
    cx_Oracle.DB_TYPE_ROWID: pa.string(),
    cx_Oracle.DB_TYPE_TIMESTAMP: pa.timestamp('ms'),
    cx_Oracle.DB_TYPE_TIMESTAMP_LTZ: pa.timestamp('ms'),
def test_convert_options():
    cls = ConvertOptions
    opts = cls()

    assert opts.check_utf8 is True
    opts.check_utf8 = False
    assert opts.check_utf8 is False

    assert opts.strings_can_be_null is False
    opts.strings_can_be_null = True
    assert opts.strings_can_be_null is True

    assert opts.column_types == {}
    # Pass column_types as mapping
    opts.column_types = {'b': pa.int16(), 'c': pa.float32()}
    assert opts.column_types == {'b': pa.int16(), 'c': pa.float32()}
    opts.column_types = {'v': 'int16', 'w': 'null'}
    assert opts.column_types == {'v': pa.int16(), 'w': pa.null()}
    # Pass column_types as schema
    schema = pa.schema([('a', pa.int32()), ('b', pa.string())])
    opts.column_types = schema
    assert opts.column_types == {'a': pa.int32(), 'b': pa.string()}
    # Pass column_types as sequence
    opts.column_types = [('x', pa.binary())]
    assert opts.column_types == {'x': pa.binary()}

    with pytest.raises(TypeError, match='DataType expected'):
        opts.column_types = {'a': None}
    with pytest.raises(TypeError):
        opts.column_types = 0

    assert isinstance(opts.null_values, list)
    assert '' in opts.null_values
    assert 'N/A' in opts.null_values
    opts.null_values = ['xxx', 'yyy']
    assert opts.null_values == ['xxx', 'yyy']

    assert isinstance(opts.true_values, list)
    opts.true_values = ['xxx', 'yyy']
    assert opts.true_values == ['xxx', 'yyy']

    assert isinstance(opts.false_values, list)
    opts.false_values = ['xxx', 'yyy']
    assert opts.false_values == ['xxx', 'yyy']

    assert opts.include_columns == []
    opts.include_columns = ['def', 'abc']
    assert opts.include_columns == ['def', 'abc']

    assert opts.include_missing_columns is False
    opts.include_missing_columns = True
    assert opts.include_missing_columns is True

    opts = cls(check_utf8=False, column_types={'a': pa.null()},
               null_values=['N', 'nn'], true_values=['T', 'tt'],
               false_values=['F', 'ff'], strings_can_be_null=True,
               include_columns=['abc', 'def'],
               include_missing_columns=True)
    assert opts.check_utf8 is False
    assert opts.column_types == {'a': pa.null()}
    assert opts.null_values == ['N', 'nn']
    assert opts.false_values == ['F', 'ff']
    assert opts.true_values == ['T', 'tt']
    assert opts.strings_can_be_null is True
    assert opts.include_columns == ['abc', 'def']
    assert opts.include_missing_columns is True
# consolidate with the logic from the parquet backend
_to_ibis_dtypes = {
    pa.int8(): dt.Int8,
    pa.int16(): dt.Int16,
    pa.int32(): dt.Int32,
    pa.int64(): dt.Int64,
    pa.uint8(): dt.UInt8,
    pa.uint16(): dt.UInt16,
    pa.uint32(): dt.UInt32,
    pa.uint64(): dt.UInt64,
    pa.float16(): dt.Float16,
    pa.float32(): dt.Float32,
    pa.float64(): dt.Float64,
    pa.string(): dt.String,
    pa.binary(): dt.Binary,
    pa.bool_(): dt.Boolean,
}


@dt.dtype.register(pa.DataType)
def from_pyarrow_primitive(arrow_type, nullable=True):
    return _to_ibis_dtypes[arrow_type](nullable=nullable)


@dt.dtype.register(pa.TimestampType)
def from_pyarrow_timestamp(arrow_type, nullable=True):
    return dt.TimestampType(timezone=arrow_type.tz)


@sch.infer.register(pa.Schema)
def test_string(value, ty, scalar_typ):
    s = pa.scalar(value, type=ty)
    assert isinstance(s, scalar_typ)
    assert s.as_py() == value
    assert s.as_py() != 'something'
    assert repr(value) in repr(s)
    assert str(s) == str(value)

    buf = s.as_buffer()
    assert isinstance(buf, pa.Buffer)
    assert buf.to_pybytes() == value.encode()


@pytest.mark.parametrize('value', [b'foo', b'bar'])
@pytest.mark.parametrize(('ty', 'scalar_typ'), [
    (pa.binary(), pa.BinaryScalar),
    (pa.large_binary(), pa.LargeBinaryScalar),
])
def test_binary(value, ty, scalar_typ):
    s = pa.scalar(value, type=ty)
    assert isinstance(s, scalar_typ)
    assert s.as_py() == value
    assert str(s) == str(value)
    assert repr(value) in repr(s)
    assert s.as_py() == value
    assert s != b'xxxxx'

    buf = s.as_buffer()
    assert isinstance(buf, pa.Buffer)
    assert buf.to_pybytes() == value