def test_coerce_timestamps_truncated(tempdir):
    """
    ARROW-2555: Test that we can truncate timestamps when coercing if
    explicitly allowed.
    """
    dt_us = datetime.datetime(year=2017, month=1, day=1, hour=1, minute=1,
                              second=1, microsecond=1)
    dt_ms = datetime.datetime(year=2017, month=1, day=1, hour=1, minute=1,
                              second=1)

    fields_us = [pa.field('datetime64', pa.timestamp('us'))]
    arrays_us = {'datetime64': [dt_us, dt_ms]}

    df_us = pd.DataFrame(arrays_us)
    schema_us = pa.schema(fields_us)

    filename = tempdir / 'pandas_truncated.parquet'
    table_us = pa.Table.from_pandas(df_us, schema=schema_us)

    _write_table(table_us, filename, version="2.0", coerce_timestamps='ms',
                 allow_truncated_timestamps=True)
    table_ms = _read_table(filename)
    df_ms = table_ms.to_pandas()

    arrays_expected = {'datetime64': [dt_ms, dt_ms]}
    df_expected = pd.DataFrame(arrays_expected)
    tm.assert_frame_equal(df_expected, df_ms)
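# Note: the `_write_table` and `_read_table` helpers used throughout these
# tests come from the shared parquet test utilities (not shown in this
# section). The sketch below is only an illustrative stand-in, inferred from
# how the helpers are called here; it is not the actual implementation, and
# the `_sketch` names are hypothetical.
def _write_table_sketch(table, where, **kwargs):
    # Accept either a pandas DataFrame or a pyarrow Table and delegate to
    # pyarrow.parquet.write_table with any extra keyword arguments.
    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq
    if isinstance(table, pd.DataFrame):
        table = pa.Table.from_pandas(table)
    pq.write_table(table, where, **kwargs)
    return table


def _read_table_sketch(source, **kwargs):
    # Thin wrapper around pyarrow.parquet.read_table.
    import pyarrow.parquet as pq
    return pq.read_table(source, **kwargs)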
def test_parquet_1_0_roundtrip(tempdir, use_legacy_dataset):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    _write_table(arrow_table, filename, version='1.0')
    table_read = _read_table(filename, use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()

    # We pass uint32_t as int64_t if we write Parquet version 1.0
    df['uint32'] = df['uint32'].values.astype(np.int64)

    tm.assert_frame_equal(df, df_read)
def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(
    tempdir, use_legacy_dataset
):
    df = alltypes_sample(size=10000)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    js = arrow_table.schema.pandas_metadata
    assert not js['index_columns']
    # ARROW-2170
    # While index_columns should be empty, columns still needs to be filled.
    assert js['columns']

    _write_table(arrow_table, filename, version='2.0', coerce_timestamps='ms')
    table_read = pq.read_pandas(
        filename, use_legacy_dataset=use_legacy_dataset)

    js = table_read.schema.pandas_metadata
    assert not js['index_columns']

    read_metadata = table_read.schema.metadata
    assert arrow_table.schema.metadata == read_metadata

    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
def test_read_single_row_group_with_column_subset():
    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, row_group_size=N / K,
                 compression='snappy', version='2.0')

    buf.seek(0)
    pf = pq.ParquetFile(buf)

    cols = list(df.columns[:2])
    row_groups = [pf.read_row_group(i, columns=cols) for i in range(K)]
    result = pa.concat_tables(row_groups)
    tm.assert_frame_equal(df[cols], result.to_pandas())

    # ARROW-4267: Selection of duplicate columns still leads to these columns
    # being read uniquely.
    row_groups = [pf.read_row_group(i, columns=cols + cols) for i in range(K)]
    result = pa.concat_tables(row_groups)
    tm.assert_frame_equal(df[cols], result.to_pandas())
def test_pandas_column_selection(tempdir, use_legacy_dataset):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16)
    })
    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    _write_table(arrow_table, filename)
    table_read = _read_table(
        filename, columns=['uint8'], use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()

    tm.assert_frame_equal(df[['uint8']], df_read)

    # ARROW-4267: Selection of duplicate columns still leads to these columns
    # being read uniquely.
    table_read = _read_table(
        filename, columns=['uint8', 'uint8'],
        use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()

    tm.assert_frame_equal(df[['uint8']], df_read)
def test_write_error_deletes_incomplete_file(tempdir):
    # ARROW-1285
    df = pd.DataFrame({
        'a': list('abc'),
        'b': list(range(1, 4)),
        'c': np.arange(3, 6).astype('u1'),
        'd': np.arange(4.0, 7.0, dtype='float64'),
        'e': [True, False, True],
        'f': pd.Categorical(list('abc')),
        'g': pd.date_range('20130101', periods=3),
        'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
        'i': pd.date_range('20130101', periods=3, freq='ns')
    })

    pdf = pa.Table.from_pandas(df)

    filename = tempdir / 'tmp_file'
    try:
        _write_table(pdf, filename)
    except pa.ArrowException:
        pass

    assert not filename.exists()
def test_parquet_invalid_version(tempdir):
    table = pa.table({'a': [1, 2, 3]})
    with pytest.raises(ValueError,
                       match="Unsupported Parquet format version"):
        _write_table(table, tempdir / 'test_version.parquet', version="2.2")
    with pytest.raises(ValueError,
                       match="Unsupported Parquet data page version"):
        _write_table(table, tempdir / 'test_version.parquet',
                     data_page_version="2.2")
def test_special_chars_filename(tempdir, use_legacy_dataset):
    table = pa.Table.from_arrays([pa.array([42])], ["ints"])
    filename = "foo # bar"
    path = tempdir / filename
    assert not path.exists()
    _write_table(table, str(path))
    assert path.exists()

    table_read = _read_table(str(path), use_legacy_dataset=use_legacy_dataset)
    assert table_read.equals(table)
def test_decimal_roundtrip_negative_scale(tempdir):
    expected = pd.DataFrame({'decimal_num': [decimal.Decimal('1.23E4')]})
    filename = tempdir / 'decimals.parquet'
    string_filename = str(filename)
    t = pa.Table.from_pandas(expected)
    _write_table(t, string_filename)
    result_table = _read_table(string_filename)
    result = result_table.to_pandas()
    tm.assert_frame_equal(result, expected)
def test_column_of_arrays(tempdir):
    df, schema = dataframe_with_arrays()

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df, schema=schema)
    _write_table(arrow_table, filename, version="2.0", coerce_timestamps='ms')
    table_read = _read_table(filename)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
def test_pandas_parquet_empty_roundtrip(tempdir, use_legacy_dataset):
    df = _test_dataframe(0)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version="2.0")
    buf = imos.getvalue()
    reader = pa.BufferReader(buf)
    df_read = _read_table(
        reader, use_legacy_dataset=use_legacy_dataset).to_pandas()
    tm.assert_frame_equal(df, df_read)
def test_column_of_lists(tempdir):
    df, schema = dataframe_with_lists(parquet_compatible=True)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df, schema=schema)
    _write_table(arrow_table, filename, version='2.0')
    table_read = _read_table(filename)
    df_read = table_read.to_pandas()

    tm.assert_frame_equal(df, df_read)
def test_large_table_int32_overflow():
    size = np.iinfo('int32').max + 1

    arr = np.ones(size, dtype='uint8')
    parr = pa.array(arr, type=pa.uint8())

    table = pa.Table.from_arrays([parr], names=['one'])
    f = io.BytesIO()
    _write_table(table, f)
def test_large_binary_overflow():
    s = b'x' * (1 << 31)
    arr = pa.array([s], type=pa.large_binary())
    table = pa.Table.from_arrays([arr], names=['strs'])
    for use_dictionary in [False, True]:
        writer = pa.BufferOutputStream()
        with pytest.raises(
                pa.ArrowInvalid,
                match="Parquet cannot store strings with size 2GB or more"):
            _write_table(table, writer, use_dictionary=use_dictionary)
def test_compression_level(use_legacy_dataset):
    arr = pa.array(list(map(int, range(1000))))
    data = [arr, arr]
    table = pa.Table.from_arrays(data, names=['a', 'b'])

    # Check one compression level.
    _check_roundtrip(table, expected=table, compression="gzip",
                     compression_level=1,
                     use_legacy_dataset=use_legacy_dataset)

    # Check another one to make sure that compression_level=1 does not
    # coincide with the default one in Arrow.
    _check_roundtrip(table, expected=table, compression="gzip",
                     compression_level=5,
                     use_legacy_dataset=use_legacy_dataset)

    # Check that the user can provide a compression per column
    _check_roundtrip(table, expected=table,
                     compression={'a': "gzip", 'b': "snappy"},
                     use_legacy_dataset=use_legacy_dataset)

    # Check that the user can provide a compression level per column
    _check_roundtrip(table, expected=table, compression="gzip",
                     compression_level={'a': 2, 'b': 3},
                     use_legacy_dataset=use_legacy_dataset)

    # Check that specifying a compression level for a codec which does not
    # allow one, or an out-of-range level for a codec which does, results
    # in an error.
    # Uncompressed, snappy, lz4 and lzo do not support specifying a
    # compression level.
    # GZIP (zlib) allows specifying a compression level but, as of zlib
    # 1.2.11, the valid range is [-1, 9].
    invalid_combinations = [("snappy", 4), ("lz4", 5), ("gzip", -1337),
                            ("None", 444), ("lzo", 14)]
    buf = io.BytesIO()
    for (codec, level) in invalid_combinations:
        with pytest.raises((ValueError, OSError)):
            _write_table(table, buf, compression=codec,
                         compression_level=level)
def test_read_pandas_passthrough_keywords(tempdir):
    # ARROW-11464 - previously not all keywords were passed through (such as
    # the filesystem keyword)
    df = pd.DataFrame({'a': [1, 2, 3]})

    filename = tempdir / 'data.parquet'
    _write_table(df, filename)

    result = pq.read_pandas(
        'data.parquet',
        filesystem=SubTreeFileSystem(str(tempdir), LocalFileSystem()))
    assert result.equals(pa.table(df))
def test_pre_buffer(pre_buffer):
    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, row_group_size=N / K,
                 compression='snappy', version='2.6')

    buf.seek(0)
    pf = pq.ParquetFile(buf, pre_buffer=pre_buffer)
    assert pf.read().num_rows == N
def test_read_pandas_column_subset(tempdir, use_legacy_dataset):
    df = _test_dataframe(10000)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version="2.0")
    buf = imos.getvalue()
    reader = pa.BufferReader(buf)
    df_read = pq.read_pandas(
        reader, columns=['strings', 'uint8'],
        use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(df[['strings', 'uint8']], df_read)
def test_memory_map(tempdir, use_legacy_dataset):
    df = alltypes_sample(size=10)

    table = pa.Table.from_pandas(df)
    _check_roundtrip(table, read_table_kwargs={'memory_map': True},
                     version='2.6', use_legacy_dataset=use_legacy_dataset)

    filename = str(tempdir / 'tmp_file')
    with open(filename, 'wb') as f:
        _write_table(table, f, version='2.6')
    table_read = pq.read_pandas(filename, memory_map=True,
                                use_legacy_dataset=use_legacy_dataset)
    assert table_read.equals(table)
def test_enable_buffered_stream(tempdir, use_legacy_dataset):
    df = alltypes_sample(size=10)

    table = pa.Table.from_pandas(df)
    _check_roundtrip(table, read_table_kwargs={'buffer_size': 1025},
                     version='2.6', use_legacy_dataset=use_legacy_dataset)

    filename = str(tempdir / 'tmp_file')
    with open(filename, 'wb') as f:
        _write_table(table, f, version='2.6')
    table_read = pq.read_pandas(filename, buffer_size=4096,
                                use_legacy_dataset=use_legacy_dataset)
    assert table_read.equals(table)
def test_single_pylist_column_roundtrip(tempdir, dtype, use_legacy_dataset):
    filename = tempdir / 'single_{}_column.parquet'.format(dtype.__name__)
    data = [pa.array(list(map(dtype, range(5))))]
    table = pa.Table.from_arrays(data, names=['a'])
    _write_table(table, filename)
    table_read = _read_table(filename, use_legacy_dataset=use_legacy_dataset)
    for i in range(table.num_columns):
        col_written = table[i]
        col_read = table_read[i]
        assert table.field(i).name == table_read.field(i).name
        assert col_read.num_chunks == 1
        data_written = col_written.chunk(0)
        data_read = col_read.chunk(0)
        assert data_written.equals(data_read)
def test_scan_contents():
    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, row_group_size=N / K,
                 compression='snappy', version='2.0')

    buf.seek(0)
    pf = pq.ParquetFile(buf)

    assert pf.scan_contents() == 10000
    assert pf.scan_contents(df.columns[:4]) == 10000
def test_pandas_can_write_nested_data(tempdir):
    data = {
        "agg_col": [
            {"page_type": 1},
            {"record_type": 1},
            {"non_consecutive_home": 0},
        ],
        "uid_first": "1001"
    }
    df = pd.DataFrame(data=data)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    # This succeeds under V2
    _write_table(arrow_table, imos)
def test_min_chunksize(use_legacy_dataset):
    data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D'])
    table = pa.Table.from_pandas(data.reset_index())

    buf = io.BytesIO()
    _write_table(table, buf, chunk_size=-1)

    buf.seek(0)
    result = _read_table(buf, use_legacy_dataset=use_legacy_dataset)

    assert result.equals(table)

    with pytest.raises(ValueError):
        _write_table(table, buf, chunk_size=0)
def test_fspath(tempdir, use_legacy_dataset):
    # ARROW-12472 support __fspath__ objects without using str()
    path = tempdir / "test.parquet"
    table = pa.table({"a": [1, 2, 3]})
    _write_table(table, path)

    fs_protocol_obj = util.FSProtocolClass(path)

    result = _read_table(fs_protocol_obj,
                         use_legacy_dataset=use_legacy_dataset)
    assert result.equals(table)

    # combined with non-local filesystem raises
    with pytest.raises(TypeError):
        _read_table(fs_protocol_obj, filesystem=FileSystem())
def test_pandas_parquet_column_multiindex(tempdir, use_legacy_dataset):
    df = alltypes_sample(size=10)
    df.columns = pd.MultiIndex.from_tuples(
        list(zip(df.columns, df.columns[::-1])),
        names=['level_1', 'level_2'])

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    assert arrow_table.schema.pandas_metadata is not None

    _write_table(arrow_table, filename, version='2.0', coerce_timestamps='ms')

    table_read = pq.read_pandas(
        filename, use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
def test_multiple_path_types(tempdir, use_legacy_dataset):
    # Test compatibility with PEP 519 path-like objects
    path = tempdir / 'zzz.parquet'
    df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)})
    _write_table(df, path)
    table_read = _read_table(path, use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)

    # Test compatibility with plain string paths
    path = str(tempdir) + 'zzz.parquet'
    df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)})
    _write_table(df, path)
    table_read = _read_table(path, use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
def test_multithreaded_read(use_legacy_dataset):
    df = alltypes_sample(size=10000)

    table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(table, buf, compression='SNAPPY', version='2.6')

    buf.seek(0)
    table1 = _read_table(
        buf, use_threads=True, use_legacy_dataset=use_legacy_dataset)

    buf.seek(0)
    table2 = _read_table(
        buf, use_threads=False, use_legacy_dataset=use_legacy_dataset)

    assert table1.equals(table2)
def test_pandas_parquet_custom_metadata(tempdir):
    df = alltypes_sample(size=10000)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    assert b'pandas' in arrow_table.schema.metadata

    _write_table(arrow_table, filename, version='2.0', coerce_timestamps='ms')

    metadata = pq.read_metadata(filename).metadata
    assert b'pandas' in metadata

    js = json.loads(metadata[b'pandas'].decode('utf8'))
    assert js['index_columns'] == [{'kind': 'range',
                                    'name': None,
                                    'start': 0, 'stop': 10000,
                                    'step': 1}]
def test_pass_separate_metadata():
    # ARROW-471
    df = alltypes_sample(size=10000)

    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, compression='snappy', version='2.0')

    buf.seek(0)
    metadata = pq.read_metadata(buf)

    buf.seek(0)
    fileh = pq.ParquetFile(buf, metadata=metadata)

    tm.assert_frame_equal(df, fileh.read().to_pandas())