Ejemplo n.º 1
0
    def _get_row_group_from_file(self, parquet_file):
        """Collect per-column metadata for the only row group of a Parquet file.

        Args:
            parquet_file: Path of the Parquet file to inspect.

        Returns:
            A list with one ``ColumnInfo`` namedtuple per column, in the order
            the columns appear in the row group. Each tuple carries:
              * schema: the schema element matching the column,
              * stats: the column chunk statistics, or None when the column
                has no meta data,
              * offset_index: the deserialized OffsetIndex, or None when the
                column records no offset index location,
              * column_index: the deserialized ColumnIndex, or None when the
                column records no column index location,
              * page_headers: one PageHeader per page location listed in the
                offset index (empty when there is no offset index).

        Raises:
            AssertionError: if the file does not contain exactly one row
                group, or if the number of non-root schema elements differs
                from the number of columns in the row group.
        """
        ColumnInfo = namedtuple('ColumnInfo', [
            'schema', 'stats', 'offset_index', 'column_index', 'page_headers'
        ])

        file_meta_data = get_parquet_metadata(parquet_file)
        assert len(file_meta_data.row_groups) == 1
        # We only support flat schemas, the additional element is the root element.
        schemas = file_meta_data.schema[1:]
        row_group = file_meta_data.row_groups[0]
        assert len(schemas) == len(row_group.columns)
        row_group_index = []
        # Open in binary mode: the index structures and page headers are read
        # as serialized objects at byte offsets, so the handle must return the
        # raw bytes without text decoding or newline translation.
        with open(parquet_file, 'rb') as file_handle:
            for column, schema in zip(row_group.columns, schemas):
                # The column index is optional; read it only when both its
                # offset and its length are set.
                column_index_offset = column.column_index_offset
                column_index_length = column.column_index_length
                column_index = None
                if column_index_offset and column_index_length:
                    column_index = read_serialized_object(
                        ColumnIndex, file_handle, column_index_offset,
                        column_index_length)

                column_meta_data = column.meta_data
                stats = None
                if column_meta_data:
                    stats = column_meta_data.statistics

                # The offset index is optional as well. When present, it lists
                # the location of every page, which lets us read each header.
                offset_index_offset = column.offset_index_offset
                offset_index_length = column.offset_index_length
                offset_index = None
                page_headers = []
                if offset_index_offset and offset_index_length:
                    offset_index = read_serialized_object(
                        OffsetIndex, file_handle, offset_index_offset,
                        offset_index_length)
                    for page_loc in offset_index.page_locations:
                        page_header = read_serialized_object(
                            PageHeader, file_handle, page_loc.offset,
                            page_loc.compressed_page_size)
                        page_headers.append(page_header)

                column_info = ColumnInfo(schema, stats, offset_index,
                                         column_index, page_headers)
                row_group_index.append(column_info)
        return row_group_index
Ejemplo n.º 2
0
 def _try_read_bloom_filter_header(self, file_handle, bloom_filter_offset):
     """Probe for a Bloom filter header at 'bloom_filter_offset' in 'file_handle'.

     The header is read with a window of 8 bytes; every time the read raises
     EOFError the window is doubled and the read is retried, as long as the
     window does not exceed 1024 bytes.

     Returns a tuple (header, size). On success 'header' is the deserialized
     BloomFilterHeader and 'size' is the window length that was used for the
     successful read. If no attempt succeeds, 'header' is None and 'size' is
     unspecified.
     """
     window = 8
     parsed = None
     while parsed is None and window <= 1024:
         try:
             parsed = read_serialized_object(
                 BloomFilterHeader, file_handle, bloom_filter_offset, window)
         except EOFError:
             # NOTE(review): EOFError is presumably raised when the window is
             # too short to hold the full header -- confirm against
             # read_serialized_object. Retry with a window twice as large.
             window = window * 2
     return (parsed, window)