def test_compare_schemas():
    """Exercise equality semantics of ParquetSchema and ColumnSchema.

    Two files written from the same DataFrame must compare equal; a file
    written from a column subset must not; comparison against an arbitrary
    non-schema object must be unequal rather than raise.
    """
    frame = alltypes_sample(size=10000)
    full_a = make_sample_file(frame)
    full_b = make_sample_file(frame)
    # Every other column only -> structurally different schema.
    partial = make_sample_file(frame[frame.columns[::2]])

    schema = full_a.schema

    # ParquetSchema: reflexive, equal across identical files, unequal otherwise.
    assert isinstance(schema, pq.ParquetSchema)
    assert schema.equals(schema)
    assert schema == schema
    assert schema.equals(full_b.schema)
    assert schema == full_b.schema
    assert schema != 'arbitrary object'
    assert not schema.equals(partial.schema)
    assert schema != partial.schema

    # ColumnSchema: same checks at the per-column level.
    first = schema[0]
    second = schema[1]
    assert isinstance(first, pq.ColumnSchema)
    assert first.equals(first)
    assert first == first
    assert not first.equals(second)
    assert first != second
    assert first != 'arbitrary object'
def test_parquet_column_statistics_api(data, type, physical_type, min_value,
                                       max_value, null_count, num_values,
                                       distinct_count):
    """Check per-column statistics for a single-column Parquet file.

    Parameters are supplied by the test's parametrization: the raw values,
    their Arrow type, and the statistics the written file is expected to
    report back.
    """
    frame = pd.DataFrame({'data': data})
    arrow_schema = pa.schema([pa.field('data', type)])
    # safe=False: allow lossy pandas->Arrow conversions for the exotic cases.
    table = pa.Table.from_pandas(frame, schema=arrow_schema, safe=False)

    parquet_file = make_sample_file(table)
    statistics = parquet_file.metadata.row_group(0).column(0).statistics

    assert statistics.has_min_max
    assert _close(type, statistics.min, min_value)
    assert _close(type, statistics.max, max_value)
    assert statistics.null_count == null_count
    assert statistics.num_values == num_values
    # TODO(kszucs): until the parquet-cpp API exposes HasDistinctCount,
    # a missing distinct_count is represented as zero instead of None.
    assert statistics.distinct_count == distinct_count
    assert statistics.physical_type == physical_type
def test_parquet_metadata_api():
    """Smoke-test the Parquet metadata accessors.

    Walks FileMetaData, ParquetSchema/ColumnSchema, RowGroupMetaData and
    ColumnChunkMetaData for a sample file: reprs don't raise, counts and
    per-column attributes match the written DataFrame, and out-of-range
    indexing raises IndexError.

    Fix: the original asserted ``col.max_repetition_level == 0`` twice in a
    row; the duplicate has been removed.
    """
    df = alltypes_sample(size=10000)
    df = df.reindex(columns=sorted(df.columns))
    df.index = np.random.randint(0, 1000000, size=len(df))

    fileh = make_sample_file(df)
    ncols = len(df.columns)

    # Series of sniff tests on the file-level metadata.
    meta = fileh.metadata
    repr(meta)
    assert meta.num_rows == len(df)
    assert meta.num_columns == ncols + 1  # +1 for index
    assert meta.num_row_groups == 1
    assert meta.format_version == '2.0'
    assert 'parquet-cpp' in meta.created_by
    assert isinstance(meta.serialized_size, int)
    assert isinstance(meta.metadata, dict)

    # Schema
    schema = fileh.schema
    assert meta.schema is schema
    assert len(schema) == ncols + 1  # +1 for index
    repr(schema)

    col = schema[0]
    repr(col)
    assert col.name == df.columns[0]
    assert col.max_definition_level == 1
    assert col.max_repetition_level == 0
    assert col.physical_type == 'BOOLEAN'
    assert col.converted_type == 'NONE'

    with pytest.raises(IndexError):
        schema[ncols + 1]  # +1 for index

    with pytest.raises(IndexError):
        schema[-1]  # negative indexing is not supported here

    # Row group: every row group and column chunk is reachable and typed.
    for rg in range(meta.num_row_groups):
        rg_meta = meta.row_group(rg)
        assert isinstance(rg_meta, pq.RowGroupMetaData)
        repr(rg_meta)

        for col in range(rg_meta.num_columns):
            col_meta = rg_meta.column(col)
            assert isinstance(col_meta, pq.ColumnChunkMetaData)
            repr(col_meta)

    with pytest.raises(IndexError):
        meta.row_group(-1)

    with pytest.raises(IndexError):
        meta.row_group(meta.num_row_groups + 1)

    rg_meta = meta.row_group(0)
    assert rg_meta.num_rows == len(df)
    assert rg_meta.num_columns == ncols + 1  # +1 for index
    assert rg_meta.total_byte_size > 0

    with pytest.raises(IndexError):
        col_meta = rg_meta.column(-1)

    with pytest.raises(IndexError):
        col_meta = rg_meta.column(ncols + 2)

    # First column chunk ('bool') in detail.
    col_meta = rg_meta.column(0)
    assert col_meta.file_offset > 0
    assert col_meta.file_path == ''  # created from BytesIO
    assert col_meta.physical_type == 'BOOLEAN'
    assert col_meta.num_values == 10000
    assert col_meta.path_in_schema == 'bool'
    assert col_meta.is_stats_set is True
    assert isinstance(col_meta.statistics, pq.Statistics)
    assert col_meta.compression == 'SNAPPY'
    assert col_meta.encodings == ('PLAIN', 'RLE')
    assert col_meta.has_dictionary_page is False
    assert col_meta.dictionary_page_offset is None
    assert col_meta.data_page_offset > 0
    assert col_meta.total_compressed_size > 0
    assert col_meta.total_uncompressed_size > 0
    with pytest.raises(NotImplementedError):
        col_meta.has_index_page
    with pytest.raises(NotImplementedError):
        col_meta.index_page_offset
def test_parquet_raise_on_unset_statistics():
    """An all-null column yields statistics with no min/max.

    ``has_min_max`` must be False and ``max`` must come back as None rather
    than raising or returning garbage.
    """
    frame = pd.DataFrame({"t": pd.Series([pd.NaT], dtype="datetime64[ns]")})
    handle = make_sample_file(pa.Table.from_pandas(frame))

    stats = handle.metadata.row_group(0).column(0).statistics
    assert not stats.has_min_max
    assert stats.max is None