def test_direct_read_dictionary_subfield(use_legacy_dataset):
    """Reading a parquet list<string> column with read_dictionary on the
    item subfield path ('f0.list.item') must yield a single-chunk
    list<dictionary<string>> column equal to dictionary-encoding the
    original values by hand.
    """
    repeats = 10
    nunique = 5
    data = [
        [[util.rands(10)] for i in range(nunique)] * repeats,
    ]
    table = pa.table(data, names=['f0'])

    # Round-trip through an in-memory parquet buffer
    bio = pa.BufferOutputStream()
    pq.write_table(table, bio)
    contents = bio.getvalue()
    result = pq.read_table(pa.BufferReader(contents),
                           read_dictionary=['f0.list.item'],
                           use_legacy_dataset=use_legacy_dataset)

    # Build the expected list<dictionary<string>> column manually
    arr = pa.array(data[0])
    values_as_dict = arr.values.dictionary_encode()
    inner_indices = values_as_dict.indices.cast('int32')
    new_values = pa.DictionaryArray.from_arrays(inner_indices,
                                                values_as_dict.dictionary)

    # One offset per row plus the trailing end offset. Derived from the
    # data rather than hard-coded so that changing repeats/nunique above
    # cannot silently desynchronize the expectation (was range(51)).
    offsets = pa.array(range(len(data[0]) + 1), type='int32')
    expected_arr = pa.ListArray.from_arrays(offsets, new_values)
    expected = pa.table([expected_arr], names=['f0'])

    assert result.equals(expected)
    assert result[0].num_chunks == 1
def test_deserialize_pandas_arrow_7956():
    """ARROW-7956: repeatedly round-tripping a DataFrame through the
    pandas IPC serialize/deserialize path must not leak memory."""
    df = pd.DataFrame({
        'a': np.arange(10000),
        'b': [test_util.rands(5) for _ in range(10000)]
    })

    def action():
        serialized = pa.ipc.serialize_pandas(df).to_pybytes()
        pa.ipc.deserialize_pandas(pa.py_buffer(serialized))

    # Fail the test if memory growth crosses 128 MiB (1 << 27 bytes)
    test_util.memory_leak_check(action, threshold=1 << 27, iterations=100)
def test_leak3():
    """Repeated ParquetWriter.write_table calls should not accumulate
    memory beyond a small tolerance."""
    import pyarrow.parquet as pq

    df = pd.DataFrame({'a{0}'.format(i): [1, 2, 3, 4] for i in range(50)})
    table = pa.Table.from_pandas(df, preserve_index=False)

    writer = pq.ParquetWriter('leak_test_' + rands(5) + '.parquet',
                              table.schema)

    def func():
        writer.write_table(table, row_group_size=len(table))

    # This does not "leak" per se but we do want to have this use as little
    # memory as possible
    assert_does_not_leak(func, iterations=500, check_interval=50,
                         tolerance=20)

    # Finalize the file and release the writer's resources — the original
    # left the writer open, leaking the handle and leaving the parquet
    # footer unwritten.
    writer.close()
def test_direct_read_dictionary(use_legacy_dataset):
    """ARROW-3325: a string column written to parquet and read back with
    read_dictionary=['f0'] must equal dictionary-encoding the original
    column directly."""
    repeats = 10
    nunique = 5
    data = [
        [util.rands(10) for i in range(nunique)] * repeats,
    ]
    table = pa.table(data, names=['f0'])

    # Write to an in-memory buffer, then read back with dictionary decoding
    sink = pa.BufferOutputStream()
    pq.write_table(table, sink)
    buf = sink.getvalue()
    result = pq.read_table(pa.BufferReader(buf),
                           read_dictionary=['f0'],
                           use_legacy_dataset=use_legacy_dataset)

    # Compute dictionary-encoded subfield
    expected = pa.table([table[0].dictionary_encode()], names=['f0'])
    assert result.equals(expected)
def _test_dataframe(size=10000, seed=0):
    """Build a deterministic test DataFrame covering every integer width
    plus float, bool, string and all-null columns.

    Column order and the sequence of np.random draws match the original
    construction exactly, so the same seed yields the same frame.
    """
    import pandas as pd

    np.random.seed(seed)

    columns = {}
    # All eight fixed-width integer dtypes, named after the dtype
    for dtype in (np.uint8, np.uint16, np.uint32, np.uint64,
                  np.int8, np.int16, np.int32, np.int64):
        columns[dtype.__name__] = _random_integers(size, dtype)

    columns['float32'] = np.random.randn(size).astype(np.float32)
    columns['float64'] = np.arange(size, dtype=np.float64)
    columns['bool'] = np.random.randn(size) > 0
    columns['strings'] = [util.rands(10) for i in range(size)]
    columns['all_none'] = [None] * size
    columns['all_none_category'] = [None] * size

    df = pd.DataFrame(columns)
    # TODO(PARQUET-1015)
    # df['all_none_category'] = df['all_none_category'].astype('category')
    return df