def test_data_type():
    a = pbn.DiscreteFactor("A", [])

    with pytest.raises(ValueError) as ex:
        a.data_type()
    assert "DiscreteFactor factor not fitted." in str(ex.value)

    categories = np.asarray(["a1", "a2"])
    a_values = pd.Categorical(categories[np.random.randint(len(categories), size=100)],
                              categories=categories, ordered=False)
    df = pd.DataFrame({'A': a_values})
    a.fit(df)
    assert a.data_type() == pa.dictionary(pa.int8(), pa.string())

    # 128 categories (codes 0..127) still fit in int8 indices...
    categories = np.asarray(["a" + str(i) for i in range(1, 129)])
    a_values = pd.Categorical(categories[np.random.randint(len(categories), size=100)],
                              categories=categories, ordered=False)
    df = pd.DataFrame({'A': a_values})
    a.fit(df)
    assert a.data_type() == pa.dictionary(pa.int8(), pa.string())

    # ...but 129 categories need int16 indices.
    categories = np.asarray(["a" + str(i) for i in range(1, 130)])
    a_values = pd.Categorical(categories[np.random.randint(len(categories), size=100)],
                              categories=categories, ordered=False)
    df = pd.DataFrame({'A': a_values})
    a.fit(df)
    assert a.data_type() == pa.dictionary(pa.int16(), pa.string())

def test_schema_merge():
    a = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ])
    b = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('qux', pa.bool_())
    ])
    c = pa.schema([
        pa.field('quux', pa.dictionary(pa.int32(), pa.string()))
    ])
    d = pa.schema([
        pa.field('foo', pa.int64()),
        pa.field('qux', pa.bool_())
    ])

    result = pa.unify_schemas([a, b, c])
    expected = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
        pa.field('qux', pa.bool_()),
        pa.field('quux', pa.dictionary(pa.int32(), pa.string()))
    ])
    assert result.equals(expected)

    # 'foo' has conflicting types (int32 vs int64)
    with pytest.raises(pa.ArrowInvalid):
        pa.unify_schemas([b, d])

    # ARROW-14002: Try with tuple instead of list
    result = pa.unify_schemas((a, b, c))
    assert result.equals(expected)

def test_column_types_dict(self):
    # Ask for dict-encoded column types in ConvertOptions
    column_types = [('a', pa.dictionary(pa.int32(), pa.utf8())),
                    ('b', pa.dictionary(pa.int32(), pa.int64())),
                    ('c', pa.dictionary(pa.int32(), pa.decimal128(11, 2))),
                    ('d', pa.dictionary(pa.int32(), pa.large_utf8()))]
    opts = ConvertOptions(column_types=dict(column_types))
    rows = (b"a,b,c,d\n"
            b"abc,123456,1.0,zz\n"
            b"defg,123456,0.5,xx\n"
            b"abc,N/A,1.0,xx\n")
    table = self.read_bytes(rows, convert_options=opts)

    schema = pa.schema(column_types)
    expected = {
        'a': ["abc", "defg", "abc"],
        'b': [123456, 123456, None],
        'c': [Decimal("1.00"), Decimal("0.50"), Decimal("1.00")],
        'd': ["zz", "xx", "xx"],
    }
    assert table.schema == schema
    assert table.to_pydict() == expected

    # Unsupported index type
    column_types[0] = ('a', pa.dictionary(pa.int8(), pa.utf8()))
    opts = ConvertOptions(column_types=dict(column_types))
    with pytest.raises(NotImplementedError):
        table = self.read_bytes(rows, convert_options=opts)

def test_table(n, types=None, offset=None, length=None, nullable=True):
    if types is None:
        types = [
            pyarrow.null(),
            pyarrow.bool_(),
            pyarrow.int8(),
            pyarrow.int16(),
            pyarrow.int32(),
            pyarrow.int64(),
            pyarrow.uint8(),
            pyarrow.uint16(),
            pyarrow.uint32(),
            pyarrow.uint64(),
            pyarrow.float16(),
            pyarrow.float32(),
            pyarrow.float64(),
            pyarrow.date32(),
            pyarrow.date64(),
            pyarrow.timestamp('s'),
            pyarrow.timestamp('ms'),
            pyarrow.timestamp('us'),
            pyarrow.timestamp('ns'),
            pyarrow.time32('s'),
            pyarrow.time32('ms'),
            pyarrow.time64('us'),
            pyarrow.time64('ns'),
            pyarrow.string(),
            pyarrow.binary(),
            pyarrow.binary(4),
            pyarrow.dictionary(pyarrow.int32(), pyarrow.string(), True),
            pyarrow.dictionary(pyarrow.int64(), pyarrow.int64(), True),
            pyarrow.dictionary(pyarrow.int32(), pyarrow.string(), False),
            pyarrow.dictionary(pyarrow.int64(), pyarrow.int64(), False),
            pyarrow.list_(pyarrow.int32()),
            pyarrow.struct([pyarrow.field('int32', pyarrow.int32())]),
            pyarrow.list_(
                pyarrow.struct([pyarrow.field('int32', pyarrow.int32())])),
            pyarrow.struct(
                [pyarrow.field('int32', pyarrow.list_(pyarrow.int32()))]),
        ]
    data = list()
    for t in types:
        name = str(t)
        array = TestArrayGenerator(n, t, False).array
        if offset is not None:
            array = array.slice(offset, length)
        data.append(pyarrow.column(name, array))
        if nullable:
            name = str(t) + ' (null)'
            array = TestArrayGenerator(n, t, True).array
            if offset is not None:
                array = array.slice(offset, length)
            data.append(pyarrow.column(name, array))
    return pyarrow.Table.from_arrays(data)

def test_dictionary_ordered_equals():
    # Python side checking of ARROW-6345
    d1 = pa.dictionary('int32', 'binary', ordered=True)
    d2 = pa.dictionary('int32', 'binary', ordered=False)
    d3 = pa.dictionary('int8', 'binary', ordered=True)
    d4 = pa.dictionary('int32', 'binary', ordered=True)

    assert not d1.equals(d2)
    assert not d1.equals(d3)
    assert d1.equals(d4)

def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert ty0.index_type == pa.int32()
    assert isinstance(ty0.dictionary, pa.Array)
    assert ty0.dictionary.to_pylist() == ['a', 'b', 'c']
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.array([1.0, 2.0]), ordered=True)
    assert ty1.index_type == pa.int8()
    assert isinstance(ty1.dictionary, pa.Array)
    assert ty1.dictionary.to_pylist() == [1.0, 2.0]
    assert ty1.ordered is True

def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert ty0.index_type == pa.int32()
    assert isinstance(ty0.dictionary, pa.Array)
    assert ty0.dictionary.to_pylist() == ['a', 'b', 'c']
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.float32(), pa.array([1.0, 2.0]), ordered=True)
    assert ty1.index_type == pa.float32()
    assert isinstance(ty1.dictionary, pa.Array)
    assert ty1.dictionary.to_pylist() == [1.0, 2.0]
    assert ty1.ordered is True

def test_auto_dict_encode(self):
    opts = ConvertOptions(auto_dict_encode=True)
    rows = "a,b\nab,1\ncdé,2\ncdé,3\nab,4".encode()
    table = self.read_bytes(rows, convert_options=opts)
    schema = pa.schema([('a', pa.dictionary(pa.int32(), pa.string())),
                        ('b', pa.int64())])
    expected = {
        'a': ["ab", "cdé", "cdé", "ab"],
        'b': [1, 2, 3, 4],
    }
    assert table.schema == schema
    assert table.to_pydict() == expected

    opts.auto_dict_max_cardinality = 2
    table = self.read_bytes(rows, convert_options=opts)
    assert table.schema == schema
    assert table.to_pydict() == expected

    # Cardinality above max => plain-encoded
    opts.auto_dict_max_cardinality = 1
    table = self.read_bytes(rows, convert_options=opts)
    assert table.schema == pa.schema([('a', pa.string()),
                                      ('b', pa.int64())])
    assert table.to_pydict() == expected

    # With invalid UTF8, not checked
    opts.auto_dict_max_cardinality = 50
    opts.check_utf8 = False
    rows = b"a,b\nab,1\ncd\xff,2\nab,3"
    table = self.read_bytes(rows, convert_options=opts,
                            validate_full=False)
    assert table.schema == schema
    dict_values = table['a'].chunk(0).dictionary
    assert len(dict_values) == 2
    assert dict_values[0].as_py() == "ab"
    assert dict_values[1].as_buffer() == b"cd\xff"

    # With invalid UTF8, checked
    opts.check_utf8 = True
    table = self.read_bytes(rows, convert_options=opts)
    schema = pa.schema([('a', pa.dictionary(pa.int32(), pa.binary())),
                        ('b', pa.int64())])
    expected = {
        'a': [b"ab", b"cd\xff", b"ab"],
        'b': [1, 2, 3],
    }
    assert table.schema == schema
    assert table.to_pydict() == expected

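# For context, not part of the suite above: a minimal standalone sketch of
# pyarrow.csv's auto-dict-encoding on a low-cardinality string column. The
# column names and rows below are illustrative assumptions, not from the tests.
def example_auto_dict_encode():
    import io

    import pyarrow as pa
    from pyarrow import csv

    rows = b"city,count\nparis,1\nlondon,2\nparis,3\n"
    opts = csv.ConvertOptions(auto_dict_encode=True)
    table = csv.read_csv(io.BytesIO(rows), convert_options=opts)
    # The string column comes back dictionary-encoded with int32 indices.
    assert table.schema.field("city").type == pa.dictionary(pa.int32(),
                                                            pa.string())
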
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.large_list(pa.uint8()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
    ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
    ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)

def test_dictionary_delta(stream_fixture):
    ty = pa.dictionary(pa.int8(), pa.utf8())
    data = [["foo", "foo", None],
            ["foo", "bar", "foo"],          # potential delta
            ["foo", "bar"],
            ["foo", None, "bar", "quux"],   # potential delta
            ["bar", "quux"],                # replacement
            ]
    batches = [
        pa.RecordBatch.from_arrays([pa.array(v, type=ty)], names=['dicts'])
        for v in data]
    schema = batches[0].schema

    def write_batches():
        with stream_fixture._get_writer(pa.MockOutputStream(),
                                        schema) as writer:
            for batch in batches:
                writer.write_batch(batch)
        return writer.stats

    st = write_batches()
    assert st.num_record_batches == 5
    assert st.num_dictionary_batches == 4
    assert st.num_replaced_dictionaries == 3
    assert st.num_dictionary_deltas == 0

    stream_fixture.use_legacy_ipc_format = None
    stream_fixture.options = pa.ipc.IpcWriteOptions(
        emit_dictionary_deltas=True)
    st = write_batches()
    assert st.num_record_batches == 5
    assert st.num_dictionary_batches == 4
    assert st.num_replaced_dictionaries == 1
    assert st.num_dictionary_deltas == 2

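# For context, not part of the suite above: a minimal sketch of enabling
# dictionary deltas on an IPC stream writer, assuming a pyarrow version where
# IpcWriteOptions(emit_dictionary_deltas=...) is available. With deltas on,
# values appended to a dictionary are written as a delta batch instead of
# replacing the whole dictionary.
def example_emit_dictionary_deltas():
    import pyarrow as pa

    ty = pa.dictionary(pa.int8(), pa.utf8())
    batch = pa.record_batch([pa.array(["foo", "bar"], type=ty)],
                            names=["dicts"])
    sink = pa.BufferOutputStream()
    options = pa.ipc.IpcWriteOptions(emit_dictionary_deltas=True)
    with pa.ipc.new_stream(sink, batch.schema, options=options) as writer:
        writer.write_batch(batch)
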
def decode(encoding, type_spec):
    if isinstance(type_spec, dict):
        if type_spec['type'] == 'duration':
            return DataType(pa.duration(type_spec['unit']))
        elif type_spec['type'] == 'timestamp':
            return DataType(pa.timestamp(type_spec['unit']))
        elif type_spec['type'] == 'list':
            sub = encoding.decode('dtype', type_spec['value_type']).arrow
            return DataType(pa.list_(sub))
        elif type_spec['type'] == 'dict':
            value_type = encoding.decode('dtype', type_spec["value_type"]).arrow
            index_type = encoding.decode('dtype', type_spec["index_type"]).arrow
            bool_ordered = type_spec["ordered"]
            return DataType(pa.dictionary(index_type, value_type, bool_ordered))
        else:
            raise ValueError(f'Do not understand type {type_spec}')
    if type_spec == 'string':
        return DataType(pa.string())
    if type_spec == 'large_string':
        return DataType(pa.large_string())
    # TODO: find a proper way to support all arrow types
    if type_spec == 'timestamp[ms]':
        return DataType(pa.timestamp('ms'))
    else:
        return DataType(np.dtype(type_spec))

def test_arrow_schema_category_column(self):
    self.assertEqual(
        arrow_schema_to_render_columns(
            pa.schema([pa.field("A", pa.dictionary(pa.int32(), pa.string()))])
        ),
        {"A": RenderColumn("A", "text", None)},
    )

def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
    ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
    ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)

def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.string())
    assert ty0.index_type == pa.int32()
    assert ty0.value_type == pa.string()
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.float64(), ordered=True)
    assert ty1.index_type == pa.int8()
    assert ty1.value_type == pa.float64()
    assert ty1.ordered is True

    # construct from non-arrow objects
    ty2 = pa.dictionary('int8', 'string')
    assert ty2.index_type == pa.int8()
    assert ty2.value_type == pa.string()
    assert ty2.ordered is False

def get_dataframe(parameters, use_threads):
    # Initialize seeds
    if parameters.seed is not None:
        np.random.seed(parameters.seed)

    # For each column, use a generic Mimesis producer to create an Iterable
    # for generating data
    for i, column_params in enumerate(parameters.column_parameters):
        if column_params.dtype is None:
            column_params.generator = column_params.generator(
                Generic("en", seed=parameters.seed))
        else:
            column_params.generator = column_params.generator()

    # Get schema for each column
    schema = pa.schema([
        pa.field(
            name=str(i),
            type=pa.dictionary(
                index_type=pa.int64(),
                value_type=pa.from_numpy_dtype(
                    type(next(iter(column_params.generator)))),
            )
            if isinstance(column_params.dtype, str)
            and column_params.dtype == "category"
            else pa.from_numpy_dtype(
                type(next(iter(column_params.generator)))
                if column_params.dtype is None
                else column_params.dtype),
            nullable=column_params.null_frequency > 0,
        )
        for i, column_params in enumerate(parameters.column_parameters)
    ])

    # Initialize column data and which columns should be sorted
    column_data = [None] * len(parameters.column_parameters)
    columns_to_sort = [
        str(i)
        for i, column_params in enumerate(parameters.column_parameters)
        if column_params.is_sorted
    ]

    # Generate data
    if not use_threads:
        for i, column_params in enumerate(parameters.column_parameters):
            column_data[i] = _generate_column(column_params,
                                              parameters.num_rows)
    else:
        pool = Pool(pa.cpu_count())
        column_data = pool.starmap(
            _generate_column,
            [(column_params, parameters.num_rows)
             for i, column_params in enumerate(parameters.column_parameters)],
        )
        pool.close()
        pool.join()

    # Convert to Pandas DataFrame and sort columns appropriately
    tbl = pa.Table.from_arrays(column_data, schema=schema)
    if columns_to_sort:
        tbl = tbl.to_pandas()
        tbl = tbl.sort_values(columns_to_sort)
        tbl = pa.Table.from_pandas(tbl, schema)
    return tbl

def get_many_types():
    # Returning these from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes.
    return (pa.null(),
            pa.bool_(),
            pa.int32(),
            pa.time32('s'),
            pa.time64('us'),
            pa.date32(),
            pa.timestamp('us'),
            pa.timestamp('us', tz='UTC'),
            pa.timestamp('us', tz='Europe/Paris'),
            pa.float16(),
            pa.float32(),
            pa.float64(),
            pa.decimal128(19, 4),
            pa.string(),
            pa.binary(),
            pa.binary(10),
            pa.list_(pa.int32()),
            pa.struct([pa.field('a', pa.int32()),
                       pa.field('b', pa.int8()),
                       pa.field('c', pa.string())]),
            pa.struct([pa.field('a', pa.int32(), nullable=False),
                       pa.field('b', pa.int8(), nullable=False),
                       pa.field('c', pa.string())]),
            pa.union([pa.field('a', pa.binary(10)),
                      pa.field('b', pa.string())],
                     mode=pa.lib.UnionMode_DENSE),
            pa.union([pa.field('a', pa.binary(10)),
                      pa.field('b', pa.string())],
                     mode=pa.lib.UnionMode_SPARSE),
            pa.union([pa.field('a', pa.binary(10), nullable=False),
                      pa.field('b', pa.string())],
                     mode=pa.lib.UnionMode_SPARSE),
            pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])))

def test_dict_col(tmpdir):
    # Create the file if necessary
    parquet_path = tmpdir / 'sample_arrow_dict.parquet'
    schema = pa.schema({
        'col1': pa.int32(),
        'col2': pa.float32(),
        'col3': pa.dictionary(pa.int16(), pa.string()),
    })
    table = pa.table(
        {
            'col1': range(10),
            'col2': np.random.randn(10),
            'col3': list(np.random.choice(['A', 'B', 'C'], 10)),
        },
        schema=schema)
    pq.write_table(table, parquet_path)

    # Load df
    df = vaex.open(parquet_path)
    dtypes = df.dtypes
    assert isinstance(dtypes["col3"].arrow, pa.lib.DictionaryType)

    # Filter
    df = df._future()
    dff1 = df[df["col3"] == 'A']
    assert dff1["col3"].unique() == ["A"]

def test_arrow():
    a = Series("a", [1, 2, 3, None])
    out = a.to_arrow()
    assert out == pa.array([1, 2, 3, None])

    a = pa.array(["foo", "bar"], pa.dictionary(pa.int32(), pa.utf8()))
    s = pl.Series("a", a)
    assert s.dtype == pl.Utf8

def test_filesystem_factory(mockfs, paths_or_selector):
    format = ds.ParquetFileFormat(
        read_options=ds.ParquetReadOptions(dictionary_columns={"str"}))

    options = ds.FileSystemFactoryOptions('subdir')
    options.partitioning = ds.DirectoryPartitioning(
        pa.schema([
            pa.field('group', pa.int32()),
            pa.field('key', pa.string())
        ]))
    assert options.partition_base_dir == 'subdir'
    assert options.ignore_prefixes == ['.', '_']
    assert options.exclude_invalid_files is False

    factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format,
                                          options)
    inspected_schema = factory.inspect()

    assert factory.inspect().equals(pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64()),
        pa.field('str', pa.dictionary(pa.int32(), pa.string())),
        pa.field('const', pa.int64()),
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ]), check_metadata=False)

    assert isinstance(factory.inspect_schemas(), list)
    assert isinstance(factory.finish(inspected_schema),
                      ds.FileSystemDataset)
    assert factory.root_partition.equals(ds.ScalarExpression(True))

    dataset = factory.finish()
    assert isinstance(dataset, ds.FileSystemDataset)
    assert len(list(dataset.scan())) == 2

    scanner = ds.Scanner(dataset)
    expected_i64 = pa.array([0, 1, 2, 3, 4], type=pa.int64())
    expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64())
    expected_str = pa.DictionaryArray.from_arrays(
        pa.array([0, 1, 2, 3, 4], type=pa.int32()),
        pa.array("0 1 2 3 4".split(), type=pa.string()))
    for task, group, key in zip(scanner.scan(), [1, 2], ['xxx', 'yyy']):
        expected_group = pa.array([group] * 5, type=pa.int32())
        expected_key = pa.array([key] * 5, type=pa.string())
        expected_const = pa.array([group - 1] * 5, type=pa.int64())
        for batch in task.execute():
            assert batch.num_columns == 6
            assert batch[0].equals(expected_i64)
            assert batch[1].equals(expected_f64)
            assert batch[2].equals(expected_str)
            assert batch[3].equals(expected_const)
            assert batch[4].equals(expected_group)
            assert batch[5].equals(expected_key)

    table = dataset.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10
    assert table.num_columns == 6

def test_dictionary_python():
    """Python -> Rust -> Python"""
    a = pa.array(["a", None, "b", None, "a"],
                 type=pa.dictionary(pa.int8(), pa.string()))
    b = rust.round_trip_array(a)
    assert a == b
    del a
    del b

def _dtype_to_arrow(cls, dtype):
    if dtype is None:
        return None
    tname = dtype if isinstance(dtype, str) else dtype.name
    if tname == "category":
        return pa.dictionary(index_type=pa.int32(), value_type=pa.string())
    elif tname == "string":
        return pa.string()
    else:
        return pa.from_numpy_dtype(tname)

def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert ty0.index_type == pa.int32()
    assert isinstance(ty0.dictionary, pa.Array)
    assert ty0.dictionary.to_pylist() == ['a', 'b', 'c']
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.array([1.0, 2.0]), ordered=True)
    assert ty1.index_type == pa.int8()
    assert isinstance(ty1.dictionary, pa.Array)
    assert ty1.dictionary.to_pylist() == [1.0, 2.0]
    assert ty1.ordered is True

    # construct from non-arrow objects
    ty2 = pa.dictionary('int8', ['a', 'b', 'c', 'd'])
    assert ty2.index_type == pa.int8()
    assert isinstance(ty2.dictionary, pa.Array)
    assert ty2.dictionary.to_pylist() == ['a', 'b', 'c', 'd']
    assert ty2.ordered is False

def test_text_dictionary_zero_chunks_is_valid(self):
    validate(
        pyarrow.Table.from_batches(
            [],
            pyarrow.schema([("A", pyarrow.dictionary(pyarrow.int32(),
                                                     pyarrow.string()))]),
        ),
        TableMetadata(0, [Text("A")]),
    )

def test_cat_int_types_3500() -> None:
    with pl.StringCache():
        # Create an enum / categorical / dictionary typed pyarrow array.
        # Most simply done by creating a pandas categorical series first.
        categorical_df = pd.Series(["a", "a", "b"], dtype="category")
        pyarrow_array = pa.Array.from_pandas(categorical_df)

        # The in-memory representation of each category can be either a
        # signed or an unsigned 8-bit integer. Pandas uses Int8...
        int_dict_type = pa.dictionary(index_type=pa.int8(),
                                      value_type=pa.utf8())
        # ... while DuckDB uses UInt8
        uint_dict_type = pa.dictionary(index_type=pa.uint8(),
                                       value_type=pa.utf8())

        for t in [int_dict_type, uint_dict_type]:
            s = pl.from_arrow(pyarrow_array.cast(t))
            assert s.series_equal(
                pl.Series(["a", "a", "b"]).cast(pl.Categorical))

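# For context, not part of the test above: a minimal sketch of re-indexing a
# dictionary array via cast, assuming a pyarrow version that implements
# dictionary-to-dictionary casts. The values stay identical; only the index
# width changes.
def example_cast_dictionary_index_type():
    import pyarrow as pa

    arr = pa.array(["a", "a", "b"]).dictionary_encode()  # int32 indices
    narrow = arr.cast(pa.dictionary(pa.int8(), pa.string()))
    assert narrow.type.index_type == pa.int8()
    assert narrow.to_pylist() == ["a", "a", "b"]
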
def test_empty_table():
    schema = pa.schema([
        pa.field('f0', pa.int64()),
        pa.field('f1', pa.dictionary(pa.int32(), pa.string())),
        pa.field('f2', pa.list_(pa.list_(pa.int64()))),
    ])
    table = schema.empty_table()
    assert isinstance(table, pa.Table)
    assert table.num_rows == 0
    assert table.schema == schema

def schema(cls):
    return pa.schema(
        {
            "instrument_id": pa.dictionary(pa.int8(), pa.string()),
            "ts_event": pa.int64(),
            "ts_init": pa.int64(),
            "last_traded_price": pa.string(),
            "traded_volume": pa.string(),
        },
        metadata={"type": "BetfairTicker"},
    )

def test_arrow():
    a = pl.Series("a", [1, 2, 3, None])
    out = a.to_arrow()
    assert out == pa.array([1, 2, 3, None])

    a = pa.array(["foo", "bar"], pa.dictionary(pa.int32(), pa.utf8()))
    s = pl.Series("a", a)
    assert s.dtype == pl.Categorical

    assert (pl.from_arrow(
        pa.array([["foo"], ["foo", "bar"]], pa.list_(pa.utf8()))).dtype
        == pl.List)

def test_schema_repr_with_dictionaries():
    fields = [
        pa.field('one', pa.dictionary(pa.int16(), pa.string())),
        pa.field('two', pa.int32())
    ]
    sch = pa.schema(fields)

    expected = ("""\
one: dictionary<values=string, indices=int16, ordered=0>
two: int32""")

    assert repr(sch) == expected

def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.string())
    assert ty0.index_type == pa.int32()
    assert ty0.value_type == pa.string()
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.float64(), ordered=True)
    assert ty1.index_type == pa.int8()
    assert ty1.value_type == pa.float64()
    assert ty1.ordered is True

    # construct from non-arrow objects
    ty2 = pa.dictionary('int8', 'string')
    assert ty2.index_type == pa.int8()
    assert ty2.value_type == pa.string()
    assert ty2.ordered is False

    # allow unsigned integers for index type
    ty3 = pa.dictionary(pa.uint32(), pa.string())
    assert ty3.index_type == pa.uint32()
    assert ty3.value_type == pa.string()
    assert ty3.ordered is False

    # invalid index type raises
    with pytest.raises(TypeError):
        pa.dictionary(pa.string(), pa.int64())

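# For context, not part of the test above: a minimal sketch of how a
# pa.dictionary type shows up in practice. dictionary_encode() yields a
# DictionaryArray with int32 indices by default.
def example_dictionary_encode():
    import pyarrow as pa

    arr = pa.array(["a", "b", "a", None, "b"]).dictionary_encode()
    assert arr.type == pa.dictionary(pa.int32(), pa.string())
    assert arr.dictionary.to_pylist() == ["a", "b"]
    assert arr.indices.to_pylist() == [0, 1, 0, None, 1]
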
def test_dict(self):
    """Python -> Rust -> Python"""
    a = pyarrow.array(
        ["a", "a", "b", None, "c"],
        pyarrow.dictionary(pyarrow.int64(), pyarrow.utf8()),
    )
    b = arrow_pyarrow_integration_testing.round_trip_array(a)
    b.validate(full=True)
    assert a.to_pylist() == b.to_pylist()
    assert a.type == b.type

def test_build_table_infer_type():
    table = make_table(
        make_column("A", ["x"]),
        make_column("B", [datetime.date(2021, 4, 7)]),
        make_column("C", [datetime.datetime(2021, 4, 7, 19, 24, 1, 1)]),
        make_column("D", [1.0]),
        make_column("dict", ["x"], dictionary=True),
    )
    assert table["A"].type == pa.string()
    assert table["B"].type == pa.date32()
    assert table["C"].type == pa.timestamp("ns")
    assert table["D"].type == pa.float64()
    assert table["dict"].type == pa.dictionary(pa.int32(), pa.string())

def create_cems_schema():
    """Make an explicit Arrow schema for the EPA CEMS data.

    Make changes in the types of the generated parquet files by editing this
    function. Note that parquet's internal representation doesn't use unsigned
    numbers or 16-bit ints, so just keep things simple here and always use
    int32 and float32.

    Returns:
        pyarrow.schema: An Arrow schema for the EPA CEMS data.

    """
    int_nullable = partial(pa.field, type=pa.int32(), nullable=True)
    int_not_null = partial(pa.field, type=pa.int32(), nullable=False)
    str_not_null = partial(pa.field, type=pa.string(), nullable=False)
    # Timestamp resolution is hourly, but second is the largest allowed.
    timestamp = partial(pa.field, type=pa.timestamp("s", tz="UTC"),
                        nullable=False)
    float_nullable = partial(pa.field, type=pa.float32(), nullable=True)
    float_not_null = partial(pa.field, type=pa.float32(), nullable=False)
    # (float32 can accurately hold integers up to 16,777,216, so there is no
    # need for float64)
    dict_nullable = partial(
        pa.field,
        type=pa.dictionary(pa.int8(), pa.string(), ordered=False),
        nullable=True)
    return pa.schema([
        dict_nullable("state"),
        int_not_null("plant_id_eia"),
        str_not_null("unitid"),
        timestamp("operating_datetime_utc"),
        float_nullable("operating_time_hours"),
        float_not_null("gross_load_mw"),
        float_nullable("steam_load_1000_lbs"),
        float_nullable("so2_mass_lbs"),
        dict_nullable("so2_mass_measurement_code"),
        float_nullable("nox_rate_lbs_mmbtu"),
        dict_nullable("nox_rate_measurement_code"),
        float_nullable("nox_mass_lbs"),
        dict_nullable("nox_mass_measurement_code"),
        float_nullable("co2_mass_tons"),
        dict_nullable("co2_mass_measurement_code"),
        float_not_null("heat_content_mmbtu"),
        int_nullable("facility_id"),
        int_nullable("unit_id_epa"),
        int_not_null("year"),
    ])

def test_schema_repr_with_dictionaries():
    dct = pa.array(['foo', 'bar', 'baz'], type=pa.string())
    fields = [
        pa.field('one', pa.dictionary(pa.int16(), dct)),
        pa.field('two', pa.int32())
    ]
    sch = pa.schema(fields)

    expected = (
        """\
one: dictionary<values=string, indices=int16, ordered=0>
  dictionary: ["foo", "bar", "baz"]
two: int32""")

    assert repr(sch) == expected

def get_many_types():
    # Returning these from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes.
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )

def test_is_dictionary():
    assert types.is_dictionary(pa.dictionary(pa.int32(), pa.string()))
    assert not types.is_dictionary(pa.int32())

def test_dictionary_type():
    ty = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert ty.index_type == pa.int32()
    assert ty.dictionary.to_pylist() == ['a', 'b', 'c']

def test_is_dictionary():
    assert types.is_dictionary(
        pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])))
    assert not types.is_dictionary(pa.int32())