Exemple #1
0
def test_compare_schemas():
    """Check equality semantics of ParquetSchema and ColumnSchema objects.

    Two files written from the same frame must report equal schemas, while
    a file written from every second column of that frame must not.
    Comparing either kind of schema object with an unrelated object is
    expected to evaluate as unequal.
    """
    frame = alltypes_sample(size=10000)

    original = make_sample_file(frame)
    duplicate = make_sample_file(frame)
    # Every second column only, so this schema differs from the other two.
    subset = make_sample_file(frame[frame.columns[::2]])

    # File-level schema (ParquetSchema)
    assert isinstance(original.schema, pq.ParquetSchema)
    # Same file: equal to itself.
    assert original.schema.equals(original.schema)
    assert original.schema == original.schema
    # Different file, same columns: still equal.
    assert original.schema.equals(duplicate.schema)
    assert original.schema == duplicate.schema
    # Unrelated object on the right-hand side.
    assert original.schema != 'arbitrary object'
    # Different set of columns: not equal.
    assert not original.schema.equals(subset.schema)
    assert original.schema != subset.schema

    # Per-column schema (ColumnSchema)
    assert isinstance(original.schema[0], pq.ColumnSchema)
    # Same column looked up twice.
    assert original.schema[0].equals(original.schema[0])
    assert original.schema[0] == original.schema[0]
    # Two different columns of the same file.
    assert not original.schema[0].equals(original.schema[1])
    assert original.schema[0] != original.schema[1]
    # Unrelated object on the right-hand side.
    assert original.schema[0] != 'arbitrary object'
Exemple #2
0
def test_parquet_column_statistics_api(data, type, physical_type, min_value,
                                       max_value, null_count, num_values,
                                       distinct_count):
    """Verify the statistics reported for a column written from ``data``.

    ``data`` is written as a column named ``data`` with Arrow type ``type``.
    The remaining arguments are the values expected on the statistics of
    the first column chunk in the first row group.
    """
    frame = pd.DataFrame({'data': data})
    arrow_schema = pa.schema([pa.field('data', type)])
    sample = make_sample_file(
        pa.Table.from_pandas(frame, schema=arrow_schema, safe=False))

    # First column chunk of the first row group.
    stats = sample.metadata.row_group(0).column(0).statistics

    assert stats.has_min_max
    # min/max are compared through the _close helper rather than with ==.
    assert _close(type, stats.min, min_value)
    assert _close(type, stats.max, max_value)
    assert stats.null_count == null_count
    assert stats.num_values == num_values
    # TODO(kszucs): until the parquet-cpp API exposes a HasDistinctCount
    # method, a missing distinct_count is reported as zero rather than None.
    assert stats.distinct_count == distinct_count
    assert stats.physical_type == physical_type
Exemple #3
0
def test_parquet_metadata_api():
    """Smoke-test the file, schema, row group and column chunk metadata.

    A sample frame with alphabetically sorted columns and a random integer
    index is written to a sample file. The test then walks the metadata
    hierarchy (file -> schema -> row group -> column chunk), checking the
    reported values, that ``repr`` works on every level, and that
    out-of-range lookups raise ``IndexError``.
    """
    num_rows = 10000
    df = alltypes_sample(size=num_rows)
    # Sort the columns so that the first one is known ('bool', see below).
    df = df.reindex(columns=sorted(df.columns))
    df.index = np.random.randint(0, 1000000, size=len(df))

    fileh = make_sample_file(df)
    ncols = len(df.columns)

    # Series of sniff tests
    meta = fileh.metadata
    repr(meta)  # only checks that repr() does not raise
    assert meta.num_rows == len(df)
    assert meta.num_columns == ncols + 1  # +1 for index
    assert meta.num_row_groups == 1
    assert meta.format_version == '2.0'
    assert 'parquet-cpp' in meta.created_by
    assert isinstance(meta.serialized_size, int)
    assert isinstance(meta.metadata, dict)

    # Schema
    schema = fileh.schema
    assert meta.schema is schema
    assert len(schema) == ncols + 1  # +1 for index
    repr(schema)

    col = schema[0]
    repr(col)
    assert col.name == df.columns[0]
    assert col.max_definition_level == 1
    assert col.max_repetition_level == 0

    assert col.physical_type == 'BOOLEAN'
    assert col.converted_type == 'NONE'

    with pytest.raises(IndexError):
        schema[ncols + 1]  # +1 for index

    # Negative indices are expected to be rejected, not wrapped around.
    with pytest.raises(IndexError):
        schema[-1]

    # Row group
    for rg_index in range(meta.num_row_groups):
        rg_meta = meta.row_group(rg_index)
        assert isinstance(rg_meta, pq.RowGroupMetaData)
        repr(rg_meta)

        for col_index in range(rg_meta.num_columns):
            col_meta = rg_meta.column(col_index)
            assert isinstance(col_meta, pq.ColumnChunkMetaData)
            repr(col_meta)

    with pytest.raises(IndexError):
        meta.row_group(-1)

    # NOTE(review): the smallest out-of-range index is num_row_groups, so
    # "+ 1" is looser than the exact boundary -- consider tightening.
    with pytest.raises(IndexError):
        meta.row_group(meta.num_row_groups + 1)

    rg_meta = meta.row_group(0)
    assert rg_meta.num_rows == len(df)
    assert rg_meta.num_columns == ncols + 1  # +1 for index
    assert rg_meta.total_byte_size > 0

    with pytest.raises(IndexError):
        rg_meta.column(-1)

    # NOTE(review): num_columns is ncols + 1, so ncols + 2 is one past the
    # exact boundary -- consider tightening.
    with pytest.raises(IndexError):
        rg_meta.column(ncols + 2)

    # Column chunk of the first (boolean) column.
    col_meta = rg_meta.column(0)
    assert col_meta.file_offset > 0
    assert col_meta.file_path == ''  # created from BytesIO
    assert col_meta.physical_type == 'BOOLEAN'
    assert col_meta.num_values == num_rows
    assert col_meta.path_in_schema == 'bool'
    assert col_meta.is_stats_set is True
    assert isinstance(col_meta.statistics, pq.Statistics)
    assert col_meta.compression == 'SNAPPY'
    assert col_meta.encodings == ('PLAIN', 'RLE')
    assert col_meta.has_dictionary_page is False
    assert col_meta.dictionary_page_offset is None
    assert col_meta.data_page_offset > 0
    assert col_meta.total_compressed_size > 0
    assert col_meta.total_uncompressed_size > 0
    # Index-page accessors are expected to raise NotImplementedError.
    with pytest.raises(NotImplementedError):
        col_meta.has_index_page
    with pytest.raises(NotImplementedError):
        col_meta.index_page_offset
Exemple #4
0
def test_parquet_raise_on_unset_statistics():
    """Check the statistics of a column holding a single NaT value.

    For such a column the statistics must report no min/max, and reading
    ``max`` must give ``None``.
    """
    nat_frame = pd.DataFrame(
        {"t": pd.Series([pd.NaT], dtype="datetime64[ns]")})
    sample = make_sample_file(pa.Table.from_pandas(nat_frame))

    # First column chunk of the first row group.
    col_chunk = sample.metadata.row_group(0).column(0)

    assert not col_chunk.statistics.has_min_max
    assert col_chunk.statistics.max is None