def _validate_ordering(self, ordering, schema, null_pages, min_values, max_values):
    """Assert that the page min/max values are ordered as 'ordering' claims.

    Entries belonging to null pages are ignored; the remaining raw values are
    decoded with decode_stats_value(schema, value) before being compared.

    Args:
        ordering: one of BoundaryOrder.ASCENDING, DESCENDING or UNORDERED.
        schema: passed through to decode_stats_value for every value.
        null_pages: per-page flags; a true entry excludes that page.
        min_values: raw per-page minimum values.
        max_values: raw per-page maximum values.

    Raises:
        AssertionError: if the decoded values contradict 'ordering', or if
            'ordering' is not one of the three known values.
    """

    def monotonic(values, descending=False):
        # Compare every element with its successor; empty and single-element
        # lists are trivially monotonic.
        neighbours = zip(values, values[1:])
        if descending:
            return all(prev >= nxt for prev, nxt in neighbours)
        return all(prev <= nxt for prev, nxt in neighbours)

    def decode_non_null(raw_values):
        # Keep only the entries of non-null pages, decoded.
        decoded = []
        for raw, page_is_null in zip(raw_values, null_pages):
            if not page_is_null:
                decoded.append(decode_stats_value(schema, raw))
        return decoded

    page_mins = decode_non_null(min_values)
    page_maxes = decode_non_null(max_values)

    # ASCENDING and DESCENDING require both the mins and the maxes to be sorted.
    if ordering == BoundaryOrder.ASCENDING:
        assert monotonic(page_mins)
        assert monotonic(page_maxes)
        return
    if ordering == BoundaryOrder.DESCENDING:
        assert monotonic(page_mins, descending=True)
        assert monotonic(page_maxes, descending=True)
        return

    assert ordering == BoundaryOrder.UNORDERED
    # UNORDERED must not be reported when mins and maxes are both sorted in
    # the same direction.
    assert not (monotonic(page_mins) and monotonic(page_maxes))
    assert not (monotonic(page_mins, descending=True)
                and monotonic(page_maxes, descending=True))
def _validate_min_max_values(self, index_size, column_info):
    """Validate min/max values of the pages in a column chunk.

    Checks that the column index holds exactly 'index_size' min and max
    entries. When the column chunk has statistics, additionally checks that
    the min/max of every non-null page lies within the column chunk's min/max.

    Args:
        index_size: expected number of entries in the column index.
        column_info: object exposing 'column_index' (with min_values,
            max_values and null_pages), 'stats' (with min_value and max_value,
            or a falsy value when absent) and 'schema'.

    Raises:
        AssertionError: if any of the checks fails.
    """
    column_index = column_info.column_index
    min_values = column_index.min_values
    assert len(min_values) == index_size
    max_values = column_index.max_values
    assert len(max_values) == index_size

    # Without column chunk statistics there is nothing to compare against.
    if not column_info.stats:
        return

    column_min_value_str = column_info.stats.min_value
    column_max_value_str = column_info.stats.max_value
    if column_min_value_str is None or column_max_value_str is None:
        # If either is None, then both need to be None.
        assert column_min_value_str is None and column_max_value_str is None
        # No min and max value, all pages need to be null.
        message = ("Page {} of column {} is not null, but doesn't have min and "
                   "max values!")
        for idx, null_page in enumerate(column_index.null_pages):
            # The column name comes from column_info.schema, the same schema
            # object that is used for decoding the values below.
            assert null_page, message.format(idx, column_info.schema.name)
        # Everything is None, no further checks needed.
        return

    column_min_value = decode_stats_value(column_info.schema, column_min_value_str)
    for null_page, page_min_str in zip(column_index.null_pages, min_values):
        if null_page:
            continue
        page_min_value = decode_stats_value(column_info.schema, page_min_str)
        # If type is str, page_min_value might have been truncated, so only
        # compare against the equally long prefix of the column minimum.
        if isinstance(page_min_value, basestring):
            assert page_min_value >= column_min_value[:len(page_min_value)]
        else:
            assert page_min_value >= column_min_value

    column_max_value = decode_stats_value(column_info.schema, column_max_value_str)
    for null_page, page_max_str in zip(column_index.null_pages, max_values):
        if null_page:
            continue
        page_max_value = decode_stats_value(column_info.schema, page_max_str)
        # If type is str, page_max_value might have been truncated and
        # incremented.
        if (isinstance(page_max_value, basestring)
                and len(page_max_value) == PAGE_INDEX_MAX_STRING_LENGTH):
            # Drop trailing NUL characters and the last (possibly incremented)
            # character before comparing with the column maximum.
            max_val_prefix = page_max_value.rstrip('\0')
            assert max_val_prefix[:-1] <= column_max_value
        else:
            assert page_max_value <= column_max_value
def _decode_row_group_stats(self, schemas, row_group_stats):
    """Decode the min/max statistics of every column of a single row group.

    Args:
        schemas: one schema per column.
        row_group_stats: one statistics object (or None) per column, in the
            same order as 'schemas'.

    Returns:
        A list with one entry per column: None when the column has no
        statistics or when both min_value and max_value are None, otherwise
        ColumnStats(schema.name, decoded_min, decoded_max).

    Raises:
        AssertionError: if the two inputs differ in length, or if exactly one
            of min_value / max_value is set for a column.
    """
    assert len(schemas) == len(row_group_stats)

    result = []
    for col_schema, col_stats in zip(schemas, row_group_stats):
        entry = None
        if col_stats is not None:
            raw_min = col_stats.min_value
            raw_max = col_stats.max_value
            if raw_min is not None or raw_max is not None:
                # Either both bounds are present or neither is.
                assert raw_min is not None and raw_max is not None
                low = decode_stats_value(col_schema, raw_min)
                high = decode_stats_value(col_schema, raw_max)
                entry = ColumnStats(col_schema.name, low, high)
        result.append(entry)

    assert len(result) == len(schemas)
    return result
def _decode_row_group_stats(self, schemas, row_group_stats):
    """Decode the min/max statistics of every column of a single row group.

    This variant reads the 'min' and 'max' attributes of each statistics
    object.

    Args:
        schemas: one schema per column.
        row_group_stats: one statistics object (or None) per column, in the
            same order as 'schemas'.

    Returns:
        A list with one entry per column: None when the column has no
        statistics or when both min and max are None, otherwise
        ColumnStats(schema.name, decoded_min, decoded_max).

    Raises:
        AssertionError: if the two inputs differ in length, or if exactly one
            of min / max is set for a column.
    """
    assert len(schemas) == len(row_group_stats)

    def decode_column(col_schema, col_stats):
        # Columns without statistics, or without any bounds, decode to None.
        if col_stats is None:
            return None
        if col_stats.min is None and col_stats.max is None:
            return None
        # Either both bounds are present or neither is.
        assert col_stats.min is not None and col_stats.max is not None
        low = decode_stats_value(col_schema, col_stats.min)
        high = decode_stats_value(col_schema, col_stats.max)
        return ColumnStats(col_schema.name, low, high)

    per_column = [decode_column(col_schema, col_stats)
                  for col_schema, col_stats in zip(schemas, row_group_stats)]
    assert len(per_column) == len(schemas)
    return per_column