def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]
    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: Ensure that supported type conversions don't
            # segfault on empty arrays of common types
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
def test_orcfile_empty():
    from pyarrow import orc
    f = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
    table = f.read()
    assert table.num_rows == 0
    schema = table.schema
    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([
            ('list', pa.list_(pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
            ]))),
        ])),
        ('list', pa.list_(pa.struct([
            ('int1', pa.int32()),
            ('string1', pa.string()),
        ]))),
        ('map', pa.list_(pa.struct([
            ('key', pa.string()),
            ('value', pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
            ])),
        ]))),
    ])
    assert schema == expected_schema
def __init__(self, values):
    if not isinstance(values, pa.ChunkedArray):
        raise ValueError
    assert values.type == pa.bool_()
    self._data = values
    self._dtype = ArrowBoolDtype()
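# Minimal usage sketch for the constructor above. Assumption: the enclosing
# class is an ArrowBoolArray-style pandas extension array (the name is
# inferred from ArrowBoolDtype and is not confirmed by the snippet).
import pyarrow as pa

chunked = pa.chunked_array([pa.array([True, None, False], type=pa.bool_())])
arr = ArrowBoolArray(chunked)  # hypothetical name of the enclosing class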
def test_sequence_numpy_boolean(seq):
    expected = [np.bool_(True), None, np.bool_(False), None]
    arr = pa.array(seq(expected))
    assert len(arr) == 4
    assert arr.null_count == 2
    assert arr.type == pa.bool_()
    assert arr.to_pylist() == expected
def test_struct_from_tuples():
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])
    data = [(5, 'foo', True), (6, 'bar', False)]
    expected = [{'a': 5, 'b': 'foo', 'c': True},
                {'a': 6, 'b': 'bar', 'c': False}]
    arr = pa.array(data, type=ty)

    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    arr2 = pa.array(data_as_ndarray, type=ty)
    assert arr.to_pylist() == expected
    assert arr.equals(arr2)

    # With omitted values
    data = [(5, 'foo', None), None, (6, None, False)]
    expected = [{'a': 5, 'b': 'foo', 'c': None},
                None,
                {'a': 6, 'b': None, 'c': False}]
    arr = pa.array(data, type=ty)
    assert arr.to_pylist() == expected

    # Invalid tuple size
    for tup in [(5, 'foo'), (), ('5', 'foo', True, None)]:
        with pytest.raises(ValueError, match="(?i)tuple size"):
            pa.array([tup], type=ty)
def test_boolean(self):
    expected = [True, None, False, None]
    arr = pyarrow.from_pylist(expected)
    assert len(arr) == 4
    assert arr.null_count == 2
    assert arr.type == pyarrow.bool_()
    assert arr.to_pylist() == expected
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
def test_struct_from_dicts_inference():
    expected_type = pa.struct([pa.field('a', pa.int64()),
                               pa.field('b', pa.string()),
                               pa.field('c', pa.bool_())])
    data = [{'a': 5, 'b': u'foo', 'c': True},
            {'a': 6, 'b': u'bar', 'c': False}]
    arr = pa.array(data)
    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == data

    # With omitted values
    data = [{'a': 5, 'c': True}, None, {}, {'a': None, 'b': u'bar'}]
    expected = [{'a': 5, 'b': None, 'c': True},
                None,
                {'a': None, 'b': None, 'c': None},
                {'a': None, 'b': u'bar', 'c': None}]
    arr = pa.array(data)
    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    arr2 = pa.array(data_as_ndarray)
    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == expected
    assert arr.equals(arr2)

    # Nested
    expected_type = pa.struct([
        pa.field('a', pa.struct([pa.field('aa', pa.list_(pa.int64())),
                                 pa.field('ab', pa.bool_())])),
        pa.field('b', pa.string())])
    data = [{'a': {'aa': [5, 6], 'ab': True}, 'b': 'foo'},
            {'a': {'aa': None, 'ab': False}, 'b': None},
            {'a': None, 'b': 'bar'}]
    arr = pa.array(data)
    assert arr.to_pylist() == data

    # Edge cases
    arr = pa.array([{}])
    assert arr.type == pa.struct([])
    assert arr.to_pylist() == [{}]

    # Mixing structs and scalars is rejected
    with pytest.raises((pa.ArrowInvalid, pa.ArrowTypeError)):
        pa.array([1, {'a': 2}])
def test_boolean_no_nulls(self):
    num_values = 100
    np.random.seed(0)
    df = pd.DataFrame({'bools': np.random.randn(num_values) > 0})
    field = A.Field.from_py('bools', A.bool_())
    schema = A.Schema.from_fields([field])
    self._check_pandas_roundtrip(df, expected_schema=schema)
def test_boolean_no_nulls(self):
    num_values = 100
    np.random.seed(0)
    df = pd.DataFrame({'bools': np.random.randn(num_values) > 0})
    field = pa.field('bools', pa.bool_())
    schema = pa.schema([field])
    self._check_pandas_roundtrip(df, expected_schema=schema)
def test_mixed_sequence_errors():
    with pytest.raises(ValueError, match="tried to convert to boolean"):
        pa.array([True, 'foo'], type=pa.bool_())

    with pytest.raises(ValueError, match="tried to convert to float32"):
        pa.array([1.5, 'foo'], type=pa.float32())

    with pytest.raises(ValueError, match="tried to convert to double"):
        pa.array([1.5, 'foo'])
def test_struct_from_mixed_sequence():
    # It is forbidden to mix dicts and tuples when initializing a struct array
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])
    data = [(5, 'foo', True), {'a': 6, 'b': 'bar', 'c': False}]
    with pytest.raises(TypeError):
        pa.array(data, type=ty)
def field(jvm_field):
    """
    Construct a Field from a org.apache.arrow.vector.types.pojo.Field
    instance.

    Parameters
    ----------
    jvm_field: org.apache.arrow.vector.types.pojo.Field

    Returns
    -------
    pyarrow.Field
    """
    name = jvm_field.getName()
    jvm_type = jvm_field.getType()

    typ = None
    if not jvm_type.isComplex():
        type_str = jvm_type.getTypeID().toString()
        if type_str == 'Null':
            typ = pa.null()
        elif type_str == 'Int':
            typ = _from_jvm_int_type(jvm_type)
        elif type_str == 'FloatingPoint':
            typ = _from_jvm_float_type(jvm_type)
        elif type_str == 'Utf8':
            typ = pa.string()
        elif type_str == 'Binary':
            typ = pa.binary()
        elif type_str == 'FixedSizeBinary':
            typ = pa.binary(jvm_type.getByteWidth())
        elif type_str == 'Bool':
            typ = pa.bool_()
        elif type_str == 'Time':
            typ = _from_jvm_time_type(jvm_type)
        elif type_str == 'Timestamp':
            typ = _from_jvm_timestamp_type(jvm_type)
        elif type_str == 'Date':
            typ = _from_jvm_date_type(jvm_type)
        elif type_str == 'Decimal':
            typ = pa.decimal128(jvm_type.getPrecision(),
                                jvm_type.getScale())
        else:
            raise NotImplementedError(
                "Unsupported JVM type: {}".format(type_str))
    else:
        # TODO: The following JVM types are not implemented:
        #       Struct, List, FixedSizeList, Union, Dictionary
        raise NotImplementedError(
            "JVM field conversion only implemented for primitive types.")

    nullable = jvm_field.isNullable()
    if jvm_field.getMetadata().isEmpty():
        metadata = None
    else:
        metadata = dict(jvm_field.getMetadata())
    return pa.field(name, typ, nullable, metadata)
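# Hedged usage sketch for field(), assuming a JVM started via jpype with the
# Arrow Java libraries on the classpath. The jar paths are placeholders, and
# the Java calls below reflect Arrow Java's Field.nullable factory and
# ArrowType.Bool.INSTANCE; treat this as illustrative, not authoritative.
import jpype
import pyarrow as pa

jpype.startJVM(classpath=["arrow-vector.jar", "arrow-memory-core.jar"])
ArrowType = jpype.JClass('org.apache.arrow.vector.types.pojo.ArrowType')
JvmField = jpype.JClass('org.apache.arrow.vector.types.pojo.Field')

jvm_field = JvmField.nullable('flag', ArrowType.Bool.INSTANCE)
assert field(jvm_field) == pa.field('flag', pa.bool_())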
def test_bit_width():
    for ty, expected in [(pa.bool_(), 1),
                         (pa.int8(), 8),
                         (pa.uint32(), 32),
                         (pa.float16(), 16),
                         (pa.decimal128(19, 4), 128),
                         (pa.binary(42), 42 * 8)]:
        assert ty.bit_width == expected

    for ty in [pa.binary(), pa.string(), pa.list_(pa.int16())]:
        with pytest.raises(ValueError, match="fixed width"):
            ty.bit_width
def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type):
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name", bq_type, mode="NULLABLE",
                               fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected = pyarrow.struct(
        (
            pyarrow.field("field01", pyarrow.string()),
            pyarrow.field("field02", pyarrow.binary()),
            pyarrow.field("field03", pyarrow.int64()),
            pyarrow.field("field04", pyarrow.int64()),
            pyarrow.field("field05", pyarrow.float64()),
            pyarrow.field("field06", pyarrow.float64()),
            pyarrow.field("field07", module_under_test.pyarrow_numeric()),
            pyarrow.field("field08", pyarrow.bool_()),
            pyarrow.field("field09", pyarrow.bool_()),
            pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
            pyarrow.field("field11", pyarrow.date32()),
            pyarrow.field("field12", module_under_test.pyarrow_time()),
            pyarrow.field("field13", module_under_test.pyarrow_datetime()),
            pyarrow.field("field14", pyarrow.string()),
        )
    )
    assert pyarrow.types.is_struct(actual)
    assert actual.num_children == len(fields)
    assert actual.equals(expected)
def test_column_types(self):
    # Ask for specific column types in ConvertOptions
    opts = ConvertOptions(column_types={'b': 'float32',
                                        'c': 'string',
                                        'd': 'boolean',
                                        'zz': 'null'})
    rows = b"a,b,c,d\n1,2,3,true\n4,-5,6,false\n"
    table = self.read_bytes(rows, convert_options=opts)
    schema = pa.schema([('a', pa.int64()),
                        ('b', pa.float32()),
                        ('c', pa.string()),
                        ('d', pa.bool_())])
    expected = {
        'a': [1, 4],
        'b': [2.0, -5.0],
        'c': ["3", "6"],
        'd': [True, False],
    }
    assert table.schema == schema
    assert table.to_pydict() == expected

    # Pass column_types as schema
    opts = ConvertOptions(
        column_types=pa.schema([('b', pa.float32()),
                                ('c', pa.string()),
                                ('d', pa.bool_()),
                                ('zz', pa.bool_())]))
    table = self.read_bytes(rows, convert_options=opts)
    assert table.schema == schema
    assert table.to_pydict() == expected

    # One of the columns in column_types fails converting
    rows = b"a,b,c,d\n1,XXX,3,true\n4,-5,6,false\n"
    with pytest.raises(pa.ArrowInvalid) as exc:
        self.read_bytes(rows, convert_options=opts)
    err = str(exc.value)
    assert "In column #1: " in err
    assert "CSV conversion error to float: invalid value 'XXX'" in err
def test_simple_varied(self):
    # Infer various kinds of data
    rows = b"a,b,c,d\n1,2,3,0\n4.0,-5,foo,True\n"
    table = self.read_bytes(rows)
    schema = pa.schema([('a', pa.float64()),
                        ('b', pa.int64()),
                        ('c', pa.string()),
                        ('d', pa.bool_())])
    assert table.schema == schema
    assert table.to_pydict() == {
        'a': [1.0, 4.0],
        'b': [2, -5],
        'c': [u"3", u"foo"],
        'd': [False, True],
    }
def test_from_numpy_dtype():
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    assert pa.from_numpy_dtype(np.str_) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
def test_simple_varied(self):
    # Infer various kinds of data
    rows = (b'{"a": 1,"b": 2, "c": "3", "d": false}\n'
            b'{"a": 4.0, "b": -5, "c": "foo", "d": true}\n')
    table = self.read_bytes(rows)
    schema = pa.schema([('a', pa.float64()),
                        ('b', pa.int64()),
                        ('c', pa.string()),
                        ('d', pa.bool_())])
    assert table.schema == schema
    assert table.to_pydict() == {
        'a': [1.0, 4.0],
        'b': [2, -5],
        'c': [u"3", u"foo"],
        'd': [False, True],
    }
def test_structarray(self):
    ints = pa.array([None, 2, 3], type=pa.int64())
    strs = pa.array([u'a', None, u'c'], type=pa.string())
    bools = pa.array([True, False, None], type=pa.bool_())
    arr = pa.StructArray.from_arrays(
        [ints, strs, bools],
        ['ints', 'strs', 'bools'])

    expected = pd.Series([
        {'ints': None, 'strs': u'a', 'bools': True},
        {'ints': 2, 'strs': None, 'bools': False},
        {'ints': 3, 'strs': u'c', 'bools': None},
    ])

    series = pd.Series(arr.to_pandas())
    tm.assert_series_equal(series, expected)
def test_structarray():
    ints = pa.array([None, 2, 3], type=pa.int64())
    strs = pa.array([u'a', None, u'c'], type=pa.string())
    bools = pa.array([True, False, None], type=pa.bool_())
    arr = pa.StructArray.from_arrays(
        [ints, strs, bools],
        ['ints', 'strs', 'bools'])

    expected = [
        {'ints': None, 'strs': u'a', 'bools': True},
        {'ints': 2, 'strs': None, 'bools': False},
        {'ints': 3, 'strs': u'c', 'bools': None},
    ]

    pylist = arr.to_pylist()
    assert pylist == expected, (pylist, expected)
def test_table_flatten():
    ty1 = pa.struct([pa.field('x', pa.int16()),
                     pa.field('y', pa.float32())])
    ty2 = pa.struct([pa.field('nest', ty1)])
    a = pa.array([(1, 2.5), (3, 4.5)], type=ty1)
    b = pa.array([((11, 12.5),), ((13, 14.5),)], type=ty2)
    c = pa.array([False, True], type=pa.bool_())
    table = pa.Table.from_arrays([a, b, c], names=['a', 'b', 'c'])

    t2 = table.flatten()
    t2._validate()
    expected = pa.Table.from_arrays([
        pa.array([1, 3], type=pa.int16()),
        pa.array([2.5, 4.5], type=pa.float32()),
        pa.array([(11, 12.5), (13, 14.5)], type=ty1),
        c],
        names=['a.x', 'a.y', 'b.nest', 'c'])
    assert t2.equals(expected)
def test_simple_nulls(self):
    # Infer various kinds of data, with nulls
    rows = (b'{"a": 1, "b": 2, "c": null, "d": null, "e": null}\n'
            b'{"a": null, "b": -5, "c": "foo", "d": null, "e": true}\n'
            b'{"a": 4.5, "b": null, "c": "nan", "d": null, "e": false}\n')
    table = self.read_bytes(rows)
    schema = pa.schema([('a', pa.float64()),
                        ('b', pa.int64()),
                        ('c', pa.string()),
                        ('d', pa.null()),
                        ('e', pa.bool_())])
    assert table.schema == schema
    assert table.to_pydict() == {
        'a': [1.0, None, 4.5],
        'b': [2, -5, None],
        'c': [None, u"foo", u"nan"],
        'd': [None, None, None],
        'e': [None, True, False],
    }
def test_custom_bools(self):
    # Infer booleans with custom values
    opts = ConvertOptions(true_values=['T', 'yes'],
                          false_values=['F', 'no'])
    rows = (b"a,b,c\n"
            b"True,T,t\n"
            b"False,F,f\n"
            b"True,yes,yes\n"
            b"False,no,no\n"
            b"N/A,N/A,N/A\n")
    table = self.read_bytes(rows, convert_options=opts)
    schema = pa.schema([('a', pa.string()),
                        ('b', pa.bool_()),
                        ('c', pa.string())])
    assert table.schema == schema
    assert table.to_pydict() == {
        'a': ["True", "False", "True", "False", "N/A"],
        'b': [True, False, True, False, None],
        'c': ["t", "f", "yes", "no", "N/A"],
    }
def test_simple_nulls(self):
    # Infer various kinds of data, with nulls
    rows = (b"a,b,c,d,e,f\n"
            b"1,2,,,3,N/A\n"
            b"nan,-5,foo,,nan,TRUE\n"
            b"4.5,#N/A,nan,,\xff,false\n")
    table = self.read_bytes(rows)
    schema = pa.schema([('a', pa.float64()),
                        ('b', pa.int64()),
                        ('c', pa.string()),
                        ('d', pa.null()),
                        ('e', pa.binary()),
                        ('f', pa.bool_())])
    assert table.schema == schema
    assert table.to_pydict() == {
        'a': [1.0, None, 4.5],
        'b': [2, -5, None],
        'c': [u"", u"foo", u"nan"],
        'd': [None, None, None],
        'e': [b"3", b"nan", b"\xff"],
        'f': [None, True, False],
    }
def test_boolean_nulls(self):
    # pandas requires upcast to object dtype
    num_values = 100
    np.random.seed(0)

    mask = np.random.randint(0, 10, size=num_values) < 3
    values = np.random.randint(0, 10, size=num_values) < 5

    arr = pa.array(values, mask=mask)
    expected = values.astype(object)
    expected[mask] = None

    field = pa.field('bools', pa.bool_())
    schema = pa.schema([field])
    ex_frame = pd.DataFrame({'bools': expected})

    table = pa.Table.from_arrays([arr], ['bools'])
    assert table.schema.equals(schema)
    result = table.to_pandas()

    tm.assert_frame_equal(result, ex_frame)
def test_struct_from_dicts():
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])
    arr = pa.array([], type=ty)
    assert arr.to_pylist() == []

    data = [{'a': 5, 'b': 'foo', 'c': True},
            {'a': 6, 'b': 'bar', 'c': False}]
    arr = pa.array(data, type=ty)
    assert arr.to_pylist() == data

    # With omitted values
    data = [{'a': 5, 'c': True}, None, {}, {'a': None, 'b': 'bar'}]
    arr = pa.array(data, type=ty)
    expected = [{'a': 5, 'b': None, 'c': True},
                None,
                {'a': None, 'b': None, 'c': None},
                {'a': None, 'b': 'bar', 'c': None}]
    assert arr.to_pylist() == expected
def get_many_types():
    # returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )
def make_sorted_groups(sorting_table: pa.Table,
                       input_table: pa.Table) -> SortedGroups:
    if not sorting_table.num_columns:
        # Exactly one output group, even for empty-table input
        return SortedGroups(
            sorted_groups=pa.table({"A": [None]}).select([]),  # 1-row, 0-col table
            sorted_input_table=input_table,  # everything is one group (maybe 0-row)
            group_splits=np.array([], np.int64),
        )

    # pyarrow 3.0.0 can't sort dictionary columns.
    # TODO make sort-dictionary work; nix this conversion
    sorting_table_without_dictionary = pa.table(
        [
            column.cast(pa.utf8())
            if pa.types.is_dictionary(column.type)
            else column
            for column in sorting_table.columns
        ],
        schema=pa.schema([
            pa.field(field.name, pa.utf8())
            if pa.types.is_dictionary(field.type)
            else field
            for field in [
                sorting_table.schema.field(i)
                for i in range(len(sorting_table.schema.names))
            ]
        ]),
    )
    indices = pa.compute.sort_indices(
        sorting_table_without_dictionary,
        sort_keys=[
            (c, "ascending")
            for c in sorting_table_without_dictionary.column_names
        ],
    )

    sorted_groups_with_dups_and_nulls = sorting_table.take(indices)

    # Behavior we ought to DEPRECATE: to mimic Pandas, we drop all groups that
    # contain NULL. This is mathematically sound for Pandas' "NA" (because if
    # all these unknown things are the same thing, doesn't that mean we know
    # something about them? -- reductio ad absurdum, QED). But Workbench's NULL
    # is a bit closer to SQL NULL, which means "whatever you say, pal".
    #
    # This null-dropping is for backwards compat. TODO make it optional ... and
    # eventually nix the option and always output NULL groups.
    nonnull_indices = indices.filter(
        find_nonnull_table_mask(sorted_groups_with_dups_and_nulls))

    if input_table.num_columns:
        sorted_input_table = input_table.take(nonnull_indices)
    else:
        # Don't .take() on a zero-column Arrow table: its .num_rows would
        # change.
        #
        # All rows are identical, so .slice() gives the table we want
        sorted_input_table = input_table.slice(0, len(nonnull_indices))

    sorted_groups_with_dups = sorting_table.take(nonnull_indices)

    # "is_dup": find each row in sorted_groups_with_dups that is _equal_ to
    # the row before it. (The first value compares the first and second row.)
    #
    # We start assuming all are equal; then we search for inequality
    if len(sorted_groups_with_dups):
        is_dup = pa.array(np.ones(len(sorted_groups_with_dups) - 1),
                          pa.bool_())
        for column in sorted_groups_with_dups.itercolumns():
            chunk = column.chunks[0]
            if pa.types.is_dictionary(chunk.type):
                chunk = chunk.indices
            first = chunk.slice(0, len(column) - 1)
            second = chunk.slice(1)
            # TODO when we support NULL groups:
            # both_null = pa.compute.and_(first.is_null(), second.is_null())
            # both_equal_if_not_null = pa.compute.equal(first, second)
            # both_equal = pa.compute.fill_null(both_equal_if_not_null, False)
            # value_is_dup = pa.compute.or_(both_null, both_equal)
            # ... and for now, it's simply:
            value_is_dup = pa.compute.equal(first, second)
            is_dup = pa.compute.and_(is_dup, value_is_dup)
        group_splits = np.where(
            ~(is_dup.to_numpy(zero_copy_only=False)))[0] + 1

        sorted_groups = reencode_dictionaries(
            sorted_groups_with_dups.take(np.insert(group_splits, 0, 0)))
    else:
        sorted_groups = sorted_groups_with_dups
        group_splits = np.array([], np.int64)

    return SortedGroups(
        sorted_groups=sorted_groups,
        sorted_input_table=sorted_input_table,
        group_splits=group_splits,
    )
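# Illustrative call of make_sorted_groups() with made-up data; assumes the
# module's SortedGroups, find_nonnull_table_mask and reencode_dictionaries
# helpers are in scope:
import pyarrow as pa

sorting_table = pa.table(
    {"flag": pa.array([True, False, True, None], pa.bool_())})
input_table = pa.table({"x": [1, 2, 3, 4]})

groups = make_sorted_groups(sorting_table, input_table)
# Per the NULL-dropping behavior documented above, the None row is dropped,
# leaving two groups (False and True) and a single split between them.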
def test_sequence_mixed_numpy_python_bools(seq):
    values = np.array([True, False])
    arr = pa.array(seq([values[0], None, values[1], True, False]))
    assert arr.type == pa.bool_()
    assert arr.to_pylist() == [True, None, False, True, False]
    _pyarrow_wrappers.Array_get_address,
    ctypes.c_double,
    addr_func_name="Array_get_address_c",
    override_module_name="katana.numba_support._pyarrow_wrappers",
)

###### Wrap chunked Arrow arrays for Numba

_array_type_map = {
    pyarrow.int64(): pyarrow.Int64Array,
    pyarrow.int32(): pyarrow.Int32Array,
    pyarrow.uint64(): pyarrow.UInt64Array,
    pyarrow.uint32(): pyarrow.UInt32Array,
    pyarrow.float64(): pyarrow.lib.DoubleArray,
    pyarrow.float32(): pyarrow.lib.FloatArray,
    pyarrow.bool_(): pyarrow.lib.BooleanArray,
}
_type_array_map = {a: t for t, a in _array_type_map.items()}

_arrow_ctypes_map = {
    pyarrow.int64(): ctypes.c_int64,
    pyarrow.int32(): ctypes.c_int32,
    pyarrow.uint64(): ctypes.c_uint64,
    pyarrow.uint32(): ctypes.c_uint32,
    pyarrow.float64(): ctypes.c_double,
    pyarrow.float32(): ctypes.c_float,
    pyarrow.bool_(): ctypes.c_bool,
}
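# Illustrative consumer of the lookup tables above; the helper function is
# hypothetical and not part of the original module:
import ctypes
import pyarrow


def arrow_ctype(array: pyarrow.Array) -> type:
    # Resolve the ctypes scalar type for an Arrow array's type using the
    # _arrow_ctypes_map table defined above
    try:
        return _arrow_ctypes_map[array.type]
    except KeyError:
        raise TypeError("no ctypes mapping for {}".format(array.type))


assert arrow_ctype(pyarrow.array([True, False])) is ctypes.c_bool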
import pytz
import hypothesis as h
import hypothesis.strategies as st
import hypothesis.extra.numpy as npst
import hypothesis.extra.pytz as tzst
import numpy as np

import pyarrow as pa

# TODO(kszucs): alphanum_text, surrogate_text
custom_text = st.text(
    alphabet=st.characters(min_codepoint=0x41, max_codepoint=0x7E))

null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())
large_binary_type = st.just(pa.large_binary())
large_string_type = st.just(pa.large_string())

signed_integer_types = st.sampled_from(
    [pa.int8(), pa.int16(), pa.int32(), pa.int64()])
unsigned_integer_types = st.sampled_from(
    [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()])
try:
    import cython
    CYTHON = True
except ImportError:
    CYTHON = False

try:
    import pyarrow as pa
    from pyarrow import csv
    import numpy as np
    ARROW = True
except ImportError:
    ARROW = False
else:
    sqream_to_pa = {
        'ftBool': pa.bool_(),
        'ftUByte': pa.uint8(),
        'ftShort': pa.int16(),
        'ftInt': pa.int32(),
        'ftLong': pa.int64(),
        'ftFloat': pa.float32(),
        'ftDouble': pa.float64(),
        'ftDate': pa.timestamp('ns'),
        'ftDateTime': pa.timestamp('ns'),
        'ftVarchar': pa.string(),
        'ftBlob': pa.utf8()
    }

__version__ = '3.0.3'

WIN = sys.platform in ('win32', 'cygwin')
    ('uint16', range(0, 10)),
    ('int16', range(0, 10)),
    ('uint32', range(0, 10)),
    ('int32', range(0, 10)),
    ('uint64', range(0, 10)),
    ('int64', range(0, 10)),
    ('float', [0.0, 0.1, 0.2]),
    ('double', [0.0, 0.1, 0.2]),
    ('string', ['a', 'b', 'c']),
    ('binary', [b'a', b'b', b'c']),
    (pa.binary(3), [b'abc', b'bcd', b'cde'])])
def test_cast_identities(ty, values):
    arr = pa.array(values, type=ty)
    assert arr.cast(ty).equals(arr)


pickle_test_parametrize = pytest.mark.parametrize(
    ('data', 'typ'),
    [([True, False, True, True], pa.bool_()),
     ([1, 2, 4, 6], pa.int64()),
     ([1.0, 2.5, None], pa.float64()),
     (['a', None, 'b'], pa.string()),
     ([], None),
     ([[1, 2], [3]], pa.list_(pa.int64())),
     ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
     ([(1, 'a'), (2, 'c'), None],
      pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))])


@pickle_test_parametrize
def test_array_pickle(data, typ):
    # Allocate here so that we don't have any Arrow data allocated.
    # This is needed to ensure that allocator tests can be reliable.
    array = pa.array(data, type=typ)
    for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
        result = pickle.loads(pickle.dumps(array, proto))
    expected = pa.array([0, 1, 2], type='i8')
    result = arr.cast('i8')

    assert result.equals(expected)


def test_simple_type_construction():
    result = pa.lib.TimestampType()
    with pytest.raises(TypeError):
        str(result)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [(pa.null(), 'float64'),
     (pa.bool_(), 'bool'),
     (pa.int8(), 'int8'),
     (pa.int16(), 'int16'),
     (pa.int32(), 'int32'),
     (pa.int64(), 'int64'),
     (pa.uint8(), 'uint8'),
     (pa.uint16(), 'uint16'),
     (pa.uint32(), 'uint32'),
     (pa.uint64(), 'uint64'),
     (pa.float16(), 'float16'),
     (pa.float32(), 'float32'),
     (pa.float64(), 'float64'),
     (pa.date32(), 'date'),
     (pa.date64(), 'date'),
     (pa.binary(), 'bytes'),
     (pa.binary(length=4), 'bytes'),
     (pa.string(), 'unicode'),
     (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
     (pa.decimal128(18, 3), 'decimal'),
     (pa.timestamp('ms'), 'datetime'),
     (pa.timestamp('us', 'UTC'), 'datetimetz'),
     (pa.time32('s'), 'time'),
     (pa.time64('us'), 'time')]
)
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_conversions_no_sentinel_values():
np.dtype("int32"): pd.Int32Dtype(), np.dtype("int64"): pd.Int64Dtype(), np.dtype("bool_"): pd.BooleanDtype(), np.dtype("object"): pd.StringDtype(), } pyarrow_dtypes_to_pandas_dtypes = { pa.uint8(): pd.UInt8Dtype(), pa.uint16(): pd.UInt16Dtype(), pa.uint32(): pd.UInt32Dtype(), pa.uint64(): pd.UInt64Dtype(), pa.int8(): pd.Int8Dtype(), pa.int16(): pd.Int16Dtype(), pa.int32(): pd.Int32Dtype(), pa.int64(): pd.Int64Dtype(), pa.bool_(): pd.BooleanDtype(), pa.string(): pd.StringDtype(), } pandas_dtypes_to_cudf_dtypes = { pd.UInt8Dtype(): np.dtype("uint8"), pd.UInt16Dtype(): np.dtype("uint16"), pd.UInt32Dtype(): np.dtype("uint32"), pd.UInt64Dtype(): np.dtype("uint64"), pd.Int8Dtype(): np.dtype("int8"), pd.Int16Dtype(): np.dtype("int16"), pd.Int32Dtype(): np.dtype("int32"), pd.Int64Dtype(): np.dtype("int64"), pd.BooleanDtype(): np.dtype("bool_"), pd.StringDtype(): np.dtype("object"), }
    new_index = ExplicitSecondaryIndex(column="another_col",
                                       index_dct={1: ["part_4"]})
    with pytest.raises(ValueError) as e:
        original_index.update(new_index)
    assert (
        str(e.value)
        == "Trying to update an index with the wrong column. Got `another_col` but expected `col`"
    )


@pytest.mark.parametrize(
    "dtype",
    [
        pa.binary(),
        pa.bool_(),
        pa.date32(),
        pa.float32(),
        pa.float64(),
        pa.int64(),
        pa.int8(),
        pa.string(),
        pa.timestamp("ns"),
    ],
)
def test_index_empty(store, dtype):
    storage_key = "dataset_uuid/some_index.parquet"
    index1 = ExplicitSecondaryIndex(
        column="col", index_dct={}, dtype=dtype,
        index_storage_key=storage_key)
pa.field("visit_id", pa.int64()), pa.field("instance_id", pa.uint32(), nullable=False), pa.field("extension_session_uuid", pa.string()), pa.field("event_ordinal", pa.int64()), pa.field("window_id", pa.int64()), pa.field("tab_id", pa.int64()), pa.field("frame_id", pa.int64()), pa.field("url", pa.string(), nullable=False), pa.field("top_level_url", pa.string()), pa.field("parent_frame_id", pa.int64()), pa.field("frame_ancestors", pa.string()), pa.field("method", pa.string(), nullable=False), pa.field("referrer", pa.string(), nullable=False), pa.field("headers", pa.string(), nullable=False), pa.field("request_id", pa.int64(), nullable=False), pa.field("is_XHR", pa.bool_()), pa.field("is_third_party_channel", pa.bool_()), pa.field("is_third_party_to_top_window", pa.bool_()), pa.field("triggering_origin", pa.string()), pa.field("loading_origin", pa.string()), pa.field("loading_href", pa.string()), pa.field("req_call_stack", pa.string()), pa.field("resource_type", pa.string(), nullable=False), pa.field("post_body", pa.string()), pa.field("post_body_raw", pa.string()), pa.field("time_stamp", pa.string(), nullable=False), ] PQ_SCHEMAS["http_requests"] = pa.schema(fields) # http_responses fields = [
], ["x", "y", "z"]), examples_text_proto=_ENCODE_TEST_EXAMPLES), dict(record_batch=pa.RecordBatch.from_arrays([ pa.array([None, None, [b"a", b"b"]], type=pa.large_list(pa.binary())), pa.array([None, None, [1.0, 2.0]], type=pa.large_list(pa.float32())), pa.array([None, None, [4, 5]], type=pa.list_(pa.int64())) ], ["x", "y", "z"]), examples_text_proto=list(reversed(_ENCODE_TEST_EXAMPLES[:-1]))), ] _INVALID_ENCODE_TYPE_CASES = [ dict(record_batch=pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], ["a"]), error=RuntimeError, error_msg_regex="Expected ListArray or LargeListArray"), dict(record_batch=pa.RecordBatch.from_arrays( [pa.array([[True], [False]], type=pa.large_list(pa.bool_()))], ["a"]), error=RuntimeError, error_msg_regex="Bad field type"), dict(record_batch=pa.RecordBatch.from_arrays([ pa.array([[b"a", b"b"], None, None, []], type=pa.large_list(pa.large_binary())), pa.array([[1.0, 2.0], None, None, []], type=pa.large_list(pa.float32())), ], ["x", "x"]), error=RuntimeError, error_msg_regex="RecordBatch contains duplicate column names") ] class RecordBatchToExamplesTest(parameterized.TestCase): @parameterized.parameters(*_ENCODE_CASES)
def test_boolean_object_nulls(self):
    arr = np.array([False, None, True] * 100, dtype=object)
    df = pd.DataFrame({'bools': arr})
    field = pa.Field.from_py('bools', pa.bool_())
    schema = pa.Schema.from_fields([field])
    self._check_pandas_roundtrip(df, expected_schema=schema)
    b = pa.array([0, 2], type=pa.int64())
    c = pa.array([0, 3], type=pa.int32())
    d = pa.array([0, 2, 0, 3], type=pa.int32())

    eq([a], [a])
    ne([a], [b])
    eq([a, c], [a, c])
    eq([a, c], [d])
    ne([c, a], [a, c])

    assert not pa.chunked_array([], type=pa.int32()).equals(None)


@pytest.mark.parametrize(
    ('data', 'typ'),
    [([True, False, True, True], pa.bool_()),
     ([1, 2, 4, 6], pa.int64()),
     ([1.0, 2.5, None], pa.float64()),
     (['a', None, 'b'], pa.string()),
     ([], pa.list_(pa.uint8())),
     ([[1, 2], [3]], pa.list_(pa.int64())),
     ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
     ([(1, 'a'), (2, 'c'), None],
      pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))])
def test_chunked_array_pickle(data, typ):
    arrays = []
    while data:
        arrays.append(pa.array(data[:2], type=typ))
        data = data[2:]
    array = pa.chunked_array(arrays, type=typ)
    array.validate()
    result = pickle.loads(pickle.dumps(array))
    result.validate()
    },
    "time": {
        "type": "long",
        "logicalType": "time-micros"
    },
    "timestamp": {
        "type": "long",
        "logicalType": "timestamp-micros"
    },
}

# This dictionary is duplicated in bigquery/google/cloud/bigquery/_pandas_helpers.py
# When modifying it be sure to update it there as well.
BQ_TO_ARROW_TYPES = {
    "int64": pyarrow.int64(),
    "float64": pyarrow.float64(),
    "bool": pyarrow.bool_(),
    "numeric": pyarrow.decimal128(38, 9),
    "string": pyarrow.utf8(),
    "bytes": pyarrow.binary(),
    "date": pyarrow.date32(),  # int32 days since epoch
    "datetime": pyarrow.timestamp("us"),
    "time": pyarrow.time64("us"),
    "timestamp": pyarrow.timestamp("us", tz="UTC"),
}

SCALAR_COLUMNS = [
    {
        "name": "int_col",
        "type": "int64"
    },
    {
        "name": "float_col",
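# Sketch of applying the BQ_TO_ARROW_TYPES mapping above to build a pyarrow
# schema from BigQuery-style column descriptors (the `columns` list below is
# illustrative, not from the original module):
import pyarrow

columns = [{"name": "int_col", "type": "int64"},
           {"name": "bool_col", "type": "bool"}]
schema = pyarrow.schema(
    pyarrow.field(col["name"], BQ_TO_ARROW_TYPES[col["type"]])
    for col in columns)
assert schema.field("bool_col").type == pyarrow.bool_()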
def test_boolean(self):
    arr = pyarrow.from_pylist([True, None, False, None])
    assert len(arr) == 4
    assert arr.null_count == 2
    assert arr.type == pyarrow.bool_()
import six

from pandas.api.types import (
    is_array_like,
    is_bool_dtype,
    is_int64_dtype,
    is_integer,
    is_integer_dtype,
)
from pandas.core.arrays import ExtensionArray
from pandas.core.dtypes.dtypes import ExtensionDtype

from ._algorithms import all_op, any_op, extract_isnull_bytemap

_python_type_map = {
    pa.null().id: six.text_type,
    pa.bool_().id: bool,
    pa.int8().id: int,
    pa.uint8().id: int,
    pa.int16().id: int,
    pa.uint16().id: int,
    pa.int32().id: int,
    pa.uint32().id: int,
    pa.int64().id: int,
    pa.uint64().id: int,
    pa.float16().id: float,
    pa.float32().id: float,
    pa.float64().id: float,
    pa.date32().id: datetime.date,
    pa.date64().id: datetime.date,
    pa.timestamp("ms").id: datetime.datetime,
    pa.binary().id: six.binary_type,
def test_boolean_object_nulls(self):
    arr = np.array([False, None, True] * 100, dtype=object)
    df = pd.DataFrame({'bools': arr})
    field = pa.field('bools', pa.bool_())
    schema = pa.schema([field])
    self._check_pandas_roundtrip(df, expected_schema=schema)
from absl.testing import absltest
from absl.testing import parameterized

_MERGE_TEST_CASES = [
    dict(
        testcase_name="empty_input",
        inputs=[],
        expected_output=dict(),
    ),
    dict(
        testcase_name="basic_types",
        inputs=[
            {
                "bool": pa.array([False, None, True], type=pa.bool_()),
                "int64": pa.array([1, None, 3], type=pa.int64()),
                "uint64": pa.array([1, None, 3], type=pa.uint64()),
                "int32": pa.array([1, None, 3], type=pa.int32()),
                "uint32": pa.array([1, None, 3], type=pa.uint32()),
                "float": pa.array([1., None, 3.], type=pa.float32()),
                "double": pa.array([1., None, 3.], type=pa.float64()),
                "bytes": pa.array([b"abc", None, b"ghi"], type=pa.binary()),
                "large_bytes": pa.array([b"abc", None, b"ghi"],
                                        type=pa.large_binary()),
                "unicode": pa.array([u"abc", None, u"ghi"], type=pa.utf8()),
                "large_unicode": pa.array([u"abc", None, u"ghi"],
                                          type=pa.large_utf8()),
            },
            {
                "bool": pa.array([None, False], type=pa.bool_()),
def get_type_and_builtins(self, n, type_name):
    """
    Return a `(arrow type, list)` tuple where the arrow type
    corresponds to the given logical *type_name*, and the list
    is a list of *n* random-generated Python objects compatible
    with the arrow type.
    """
    size = None

    if type_name in ('bool', 'decimal', 'ascii', 'unicode', 'int64 list'):
        kind = type_name
    elif type_name.startswith(('int', 'uint')):
        kind = 'int'
    elif type_name.startswith('float'):
        kind = 'float'
    elif type_name.startswith('struct'):
        kind = 'struct'
    elif type_name == 'binary':
        kind = 'varying binary'
    elif type_name.startswith('binary'):
        kind = 'fixed binary'
        size = int(type_name[6:])
        assert size > 0
    else:
        raise ValueError("unrecognized type %r" % (type_name,))

    if kind in ('int', 'float'):
        ty = getattr(pa, type_name)()
    elif kind == 'bool':
        ty = pa.bool_()
    elif kind == 'decimal':
        ty = pa.decimal128(9, 9)
    elif kind == 'fixed binary':
        ty = pa.binary(size)
    elif kind == 'varying binary':
        ty = pa.binary()
    elif kind in ('ascii', 'unicode'):
        ty = pa.string()
    elif kind == 'int64 list':
        ty = pa.list_(pa.int64())
    elif kind == 'struct':
        ty = pa.struct([pa.field('u', pa.int64()),
                        pa.field('v', pa.float64()),
                        pa.field('w', pa.bool_())])

    factories = {
        'int': self.generate_int_list,
        'float': self.generate_float_list,
        'bool': self.generate_bool_list,
        'decimal': self.generate_decimal_list,
        'fixed binary': partial(self.generate_fixed_binary_list,
                                size=size),
        'varying binary': partial(self.generate_varying_binary_list,
                                  min_size=3, max_size=40),
        'ascii': partial(self.generate_ascii_string_list,
                         min_size=3, max_size=40),
        'unicode': partial(self.generate_unicode_string_list,
                           min_size=3, max_size=40),
        'int64 list': partial(self.generate_int_list_list,
                              min_size=0, max_size=20),
        'struct': self.generate_dict_list,
        'struct from tuples': self.generate_tuple_list,
    }
    data = factories[kind](n)
    return ty, data
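# Usage sketch for the method above; `gen` stands in for an instance of the
# enclosing generator/benchmark class (the instance name is illustrative):
import pyarrow as pa

ty, data = gen.get_type_and_builtins(1000, 'bool')
assert ty == pa.bool_()
arr = pa.array(data, type=ty)  # feed the generated values to pa.array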
import argparse
import csv
import re
import sys
from datetime import datetime
from base64 import standard_b64decode

import pyarrow as pa
import pyarrow.parquet as pq

PA_BOOL = pa.bool_()
PA_FLOAT32 = pa.float32()
PA_FLOAT64 = pa.float64()
PA_INT8 = pa.int8()
PA_INT16 = pa.int16()
PA_INT32 = pa.int32()
PA_INT64 = pa.int64()
PA_STRING = pa.string()
PA_TIMESTAMP = pa.timestamp('ns')
PA_BINARY = pa.binary()


def get_delimiter(csv_file, custom_delimiter=','):
    if csv_file[-4:] == '.tsv':
        return '\t'
    return custom_delimiter if custom_delimiter else ','


def sanitize_column_name(name):
    cleaned = re.sub('[^a-z0-9]', '_', name.lower())
    cleaned = re.sub('__*', '_', cleaned)
    cleaned = re.sub('^_*', '', cleaned)
def test_sequence_numpy_boolean(seq):
    expected = [np.bool_(True), None, np.bool_(False), None]
    arr = pa.array(seq(expected))
    assert arr.type == pa.bool_()
    assert arr.to_pylist() == [True, None, False, None]
)

xfail_bool_too_few_uniques = pytest.mark.xfail_by_type_filter(
    [pa.types.is_boolean], "Test requires at least 3 unique values")


test_types = [
    FletcherTestType(
        pa.string(),
        ["🙈", "Ö", "Č", "a", "B"] * 20,
        [None, "A"],
        ["B", "B", None, None, "A", "A", "B", "C"],
        ["B", "C", "A"],
        ["B", None, "A"],
        lambda: choices(list(string.ascii_letters), k=10),
    ),
    FletcherTestType(
        pa.bool_(),
        [True, False, True, True, False] * 20,
        [None, False],
        [True, True, None, None, False, False, True, False],
        [True, False, False],
        [True, None, False],
        lambda: choices([True, False], k=10),
    ),
    FletcherTestType(
        pa.int8(),
        # Use small values here so that np.prod stays in int32
        [2, 1, 1, 2, 1] * 20,
        [None, 1],
        [2, 2, None, None, -100, -100, 2, 100],
        [2, 100, -10],
        [2, None, -10],
import pyarrow as pa

schema_fields = [
    pa.field("timestamp", pa.date64(), False),
    pa.field("timezone", pa.uint64(), False).with_metadata({
        "illex_MIN": "0",
        "illex_MAX": "1024"
    }),
    pa.field("vin", pa.uint64(), False),
    pa.field("odometer", pa.uint64(), False).with_metadata({
        "illex_MIN": "0",
        "illex_MAX": "1000"
    }),
    pa.field("hypermiling", pa.bool_(), False),
    pa.field("avgspeed", pa.uint64(), False).with_metadata({
        "illex_MIN": "0",
        "illex_MAX": "200"
    }),
    pa.field(
        "sec_in_band",
        pa.list_(
            pa.field("item", pa.uint64(), False).with_metadata({
                "illex_MIN": "0",
                "illex_MAX": "4192"
            }), 12),
        False),
    pa.field(
        "miles_in_time_range",
        pa.list_(
            pa.field("item", pa.uint64(), False).with_metadata({
def csv_to_table(self, csv_path, table_name, read=None, parse=None,
                 convert=None, con=None, auto_infer=False):
    """Pyarrow CSV reader documentation:
    https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html
    """
    if not ARROW:
        return ("Optional pyarrow dependency not found. "
                "To install: pip3 install pyarrow")

    sqream_to_pa = {
        'ftBool': pa.bool_(),
        'ftUByte': pa.uint8(),
        'ftShort': pa.int16(),
        'ftInt': pa.int32(),
        'ftLong': pa.int64(),
        'ftFloat': pa.float32(),
        'ftDouble': pa.float64(),
        'ftDate': pa.timestamp('ns'),
        'ftDateTime': pa.timestamp('ns'),
        'ftVarchar': pa.string(),
        'ftBlob': pa.utf8()
    }

    start = time.time()

    # Get table metadata
    con = con or self
    con.execute(f'select * from {table_name} where 1=0')

    # Map column names to pyarrow types and set Arrow's CSV parameters
    sqream_col_types = [col_type[0] for col_type in con.col_type_tups]
    column_types = zip(
        con.col_names,
        [sqream_to_pa[col_type[0]] for col_type in con.col_type_tups])
    read = read or csv.ReadOptions(column_names=con.col_names)
    parse = parse or csv.ParseOptions(delimiter='|')
    convert = convert or csv.ConvertOptions(
        column_types=None if auto_infer else column_types)

    # Read CSV to in-memory arrow format
    csv_arrow = csv.read_csv(csv_path,
                             read_options=read,
                             parse_options=parse,
                             convert_options=convert).combine_chunks()
    num_chunks = len(csv_arrow[0].chunks)

    # For each column, get the numpy representation for quick packing
    numpy_cols = []
    for col_type, col in zip(sqream_col_types, csv_arrow):
        # Only one chunk after combine_chunks()
        col = col.chunks[0]
        if col_type in ('ftVarchar', 'ftBlob', 'ftDate', 'ftDateTime'):
            col = col.to_pandas()
        else:
            col = col.to_numpy()
        numpy_cols.append(col)

    print(f'total loading csv: {time.time()-start}')
    start = time.time()

    # Insert columns into SQream
    col_num = csv_arrow.shape[1]
    con.executemany(
        f'insert into {table_name} values ({"?,"*(col_num-1)}?)',
        numpy_cols)
    print(f'total inserting csv: {time.time()-start}')
def __init__(
    self,
    name: str,
    values: "Union[np.array, List[Optional[Any]]]" = None,
    nullable: bool = True,
    dtype: "Optional[DataType]" = None,
):
    """
    Parameters
    ----------
    name
        Name of the series
    values
        Values of the series
    nullable
        If nullable.
            None values in a list will be interpreted as missing.
            NaN values in a numpy array will be interpreted as missing.
            Note that missing and NaNs are not the same in Polars
        Series creation may be faster if set to False and there are no
        null values.
    """
    # assume the first input were the values
    if values is None and not isinstance(name, str):
        values = name
        name = ""
    if values.__class__ == self.__class__:
        values.rename(name)
        self._s = values._s
        return

    self._s: PySeries
    # series path
    if isinstance(values, Series):
        self._from_pyseries(values)
        return
    elif isinstance(values, dict):
        raise ValueError(
            f"Constructing a Series with a dict is not supported for {values}"
        )
    elif isinstance(values, pa.Array):
        self._s = self.from_arrow(name, values)._s
        return

    # castable to numpy
    if not isinstance(values, np.ndarray) and not nullable:
        values = np.array(values)

    if dtype is not None:
        if dtype == Int8:
            self._s = PySeries.new_i8(name, values)
        elif dtype == Int16:
            self._s = PySeries.new_i16(name, values)
        elif dtype == Int32:
            self._s = PySeries.new_i32(name, values)
        elif dtype == Int64:
            self._s = PySeries.new_i64(name, values)
        elif dtype == UInt8:
            self._s = PySeries.new_u8(name, values)
        elif dtype == UInt16:
            self._s = PySeries.new_u16(name, values)
        elif dtype == UInt32:
            self._s = PySeries.new_u32(name, values)
        elif dtype == UInt64:
            self._s = PySeries.new_u64(name, values)
        elif dtype == Float32:
            self._s = PySeries.new_f32(name, values)
        elif dtype == Float64:
            self._s = PySeries.new_f64(name, values)
        elif dtype == Boolean:
            self._s = PySeries.new_bool(name, values)
        elif dtype == Utf8:
            self._s = PySeries.new_str(name, values)
        else:
            raise ValueError(
                f"dtype {dtype} not yet supported when creating a Series")
        return

    # numpy path
    if isinstance(values, np.ndarray):
        if not values.data.contiguous:
            values = np.array(values)
        if len(values.shape) > 1:
            self._s = PySeries.new_object(name, values)
            return
        dtype = values.dtype
        if dtype == np.int64:
            self._s = PySeries.new_i64(name, values)
        elif dtype == np.int32:
            self._s = PySeries.new_i32(name, values)
        elif dtype == np.int16:
            self._s = PySeries.new_i16(name, values)
        elif dtype == np.int8:
            self._s = PySeries.new_i8(name, values)
        elif dtype == np.float32:
            self._s = PySeries.new_f32(name, values, nullable)
        elif dtype == np.float64:
            self._s = PySeries.new_f64(name, values, nullable)
        elif isinstance(values[0], str):
            self._s = PySeries.new_str(name, values)
        elif dtype == np.bool_:
            self._s = PySeries.new_bool(name, values)
        elif dtype == np.uint8:
            self._s = PySeries.new_u8(name, values)
        elif dtype == np.uint16:
            self._s = PySeries.new_u16(name, values)
        elif dtype == np.uint32:
            self._s = PySeries.new_u32(name, values)
        elif dtype == np.uint64:
            self._s = PySeries.new_u64(name, values)
        else:
            self._s = PySeries.new_object(name, values)
        return
    # list path
    else:
        dtype = _find_first_non_none(values)
        # order is important as booleans are instance of int in python
        if isinstance(dtype, bool):
            self._s = PySeries.new_opt_bool(name, values)
        elif isinstance(dtype, int):
            self._s = PySeries.new_opt_i64(name, values)
        elif isinstance(dtype, float):
            self._s = PySeries.new_opt_f64(name, values)
        elif isinstance(dtype, str):
            self._s = PySeries.new_str(name, values)
        # make list array
        elif isinstance(dtype, (list, tuple)):
            value_dtype = _find_first_non_none(dtype)

            # we can expect a failure if we pass `[[12], "foo", 9]`
            # in that case we catch the exception and create an object type
            try:
                if isinstance(value_dtype, bool):
                    arrow_array = pa.array(values,
                                           pa.large_list(pa.bool_()))
                elif isinstance(value_dtype, int):
                    arrow_array = pa.array(values,
                                           pa.large_list(pa.int64()))
                elif isinstance(value_dtype, float):
                    arrow_array = pa.array(values,
                                           pa.large_list(pa.float64()))
                elif isinstance(value_dtype, str):
                    arrow_array = pa.array(values,
                                           pa.large_list(pa.large_utf8()))
                else:
                    self._s = PySeries.new_object(name, values)
                    return
                self._s = Series.from_arrow(name, arrow_array)._s
            except pa.lib.ArrowInvalid:
                self._s = PySeries.new_object(name, values)
        else:
            self._s = PySeries.new_object(name, values)
# In the following, we use the JSON serialization of the Field objects in
# Java. This ensures that we don't rely on the exact mechanics of
# constructing them in Java code, and it lets us define them as test
# parameters without invoking the JVM.
#
# The specifications were created using:
#
#   om = jpype.JClass('com.fasterxml.jackson.databind.ObjectMapper')()
#   field = …  # Code to instantiate the field
#   jvm_spec = om.writeValueAsString(field)
@pytest.mark.parametrize('typ,jvm_spec', [
    (pa.null(), '{"name":"null"}'),
    (pa.bool_(), '{"name":"bool"}'),
    (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
    (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
    (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
    (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
    (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
    (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
    (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
    (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
    (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
    (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
    (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
    (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
    (pa.time32('ms'), '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
    (pa.time64('us'),
def test_is_boolean():
    assert types.is_boolean(pa.bool_())
    assert not types.is_boolean(pa.int8())
def test_try_type_and_type_forbidden(self):
    with self.assertRaises(AssertionError):
        _ = pa.array(TypedSequence([1, 2, 3],
                                   try_type=pa.bool_(),
                                   type=pa.int64()))
logger = logging.getLogger(__name__)

PROJECT_PQ = pa.schema([
    pa.field('aha_id', pa.string()),
    pa.field('reference_prefix', pa.string()),
    pa.field('name', pa.string()),
    pa.field('last_release_num', pa.int32()),
    pa.field('last_feature_num', pa.int32()),
    pa.field('last_idea_num', pa.int32()),
    pa.field('position', pa.int32()),
    pa.field('positioning_customer', pa.string()),
    pa.field('positioning_problem', pa.string()),
    pa.field('positioning_benefit1', pa.string()),
    pa.field('positioning_benefit2', pa.string()),
    pa.field('positioning_benefit3', pa.string()),
    pa.field('product_line', pa.bool_()),
    pa.field('product_line_type', pa.string()),
    pa.field('capacity_planning_enabled', pa.bool_()),
    pa.field('ideas_scoring_system_id', pa.string()),
    pa.field('ideas_default_user_id', pa.string()),
    pa.field('default_capacity_units', pa.int32()),
    pa.field('default_feature_remaining_estimate', pa.bool_()),
    pa.field('last_page_num', pa.int32()),
    pa.field('color', pa.int32()),
    pa.field('workflow_screen_enabled', pa.bool_()),
    pa.field('competitor_scoring_system_id', pa.string()),
    pa.field('initiative_workflow_id', pa.string()),
    pa.field('strategic_imperative_workflow_id', pa.string()),
    pa.field('estimated_time_as_work_done', pa.bool_()),
    pa.field('last_epic_num', pa.int32()),
    pa.field('configuration', pa.string()),