Esempio n. 1
0
    def _validate_ordering(self, ordering, schema, null_pages, min_values,
                           max_values):
        """Assert that the page min/max values are consistent with 'ordering'.

        Entries flagged in 'null_pages' are skipped; the remaining min and max
        values are decoded with decode_stats_value() before being compared.
        'ordering' must be one of the BoundaryOrder values ASCENDING, DESCENDING
        or UNORDERED, otherwise an assertion fails.
        """
        def is_monotonic(values, descending=False):
            # Compare every element with its successor.
            neighbours = zip(values, values[1:])
            if descending:
                return all(prev >= nxt for prev, nxt in neighbours)
            return all(prev <= nxt for prev, nxt in neighbours)

        def decode_non_null(raw_values):
            # Keep only the entries of pages that are not null and decode them.
            return [
                decode_stats_value(schema, raw)
                for raw, page_is_null in zip(raw_values, null_pages)
                if not page_is_null
            ]

        page_mins = decode_non_null(min_values)
        page_maxes = decode_non_null(max_values)

        # ASCENDING and DESCENDING require both the min and the max values to be
        # sorted in the respective direction.
        if ordering == BoundaryOrder.ASCENDING:
            for values in (page_mins, page_maxes):
                assert is_monotonic(values)
            return

        if ordering == BoundaryOrder.DESCENDING:
            for values in (page_mins, page_maxes):
                assert is_monotonic(values, descending=True)
            return

        assert ordering == BoundaryOrder.UNORDERED
        # UNORDERED means the min and max values must not both be sorted, neither
        # ascending nor descending.
        assert not (is_monotonic(page_mins) and is_monotonic(page_maxes))
        assert not (is_monotonic(page_mins, descending=True)
                    and is_monotonic(page_maxes, descending=True))
Esempio n. 2
0
    def _validate_min_max_values(self, index_size, column_info):
        """Validate min/max values of the pages in a column chunk.

        Checks that the column index holds exactly 'index_size' min values and
        'index_size' max values. If the column chunk has statistics, it also
        checks that the min value of every non-null page is not below the column
        chunk's min value and that its max value is not above the column chunk's
        max value. When the column chunk statistics have no min/max value, every
        page must be a null page.

        Args:
            index_size: expected number of entries in the min/max value lists.
            column_info: object providing 'column_index' (with 'min_values',
                'max_values' and 'null_pages'), 'stats' (with 'min_value' and
                'max_value') and 'schema'.

        Raises:
            AssertionError: if any of the checks fails.
        """
        column_index = column_info.column_index
        min_values = column_index.min_values
        assert len(min_values) == index_size
        max_values = column_index.max_values
        assert len(max_values) == index_size

        # Without column chunk statistics there is nothing to compare against.
        if not column_info.stats:
            return

        column_min_value_str = column_info.stats.min_value
        column_max_value_str = column_info.stats.max_value
        if column_min_value_str is None or column_max_value_str is None:
            # If either is None, then both need to be None.
            assert column_min_value_str is None and column_max_value_str is None
            # No min and max value, all pages need to be null.
            # The column name is read from column_info.schema, the same schema
            # object used for decoding below, so that a failing check reports an
            # AssertionError with the message instead of failing while building
            # the message. The message literal itself is kept unchanged.
            for idx, null_page in enumerate(column_index.null_pages):
                assert null_page, "Page {} of column {} is not null, \
            but doesn't have min and max values!".format(
                    idx, column_info.schema.name)
            # Everything is None, no further checks needed.
            return

        # Every non-null page's min value must be >= the column chunk's min value.
        column_min_value = decode_stats_value(column_info.schema,
                                              column_min_value_str)
        for null_page, page_min_str in zip(column_index.null_pages,
                                           min_values):
            if not null_page:
                page_min_value = decode_stats_value(column_info.schema,
                                                    page_min_str)
                # If type is str, page_min_value might have been truncated, so
                # only compare against the equally long prefix of the column min.
                # NOTE: 'basestring' is the Python 2 string base type.
                if isinstance(page_min_value, basestring):
                    assert page_min_value >= column_min_value[:len(
                        page_min_value)]
                else:
                    assert page_min_value >= column_min_value

        # Every non-null page's max value must be <= the column chunk's max value.
        column_max_value = decode_stats_value(column_info.schema,
                                              column_max_value_str)
        for null_page, page_max_str in zip(column_index.null_pages,
                                           max_values):
            if not null_page:
                page_max_value = decode_stats_value(column_info.schema,
                                                    page_max_str)
                # If type is str, page_max_value might have been truncated and
                # incremented. In that case drop trailing NUL characters and the
                # last remaining character before comparing the prefix.
                if (isinstance(page_max_value, basestring) and
                        len(page_max_value) == PAGE_INDEX_MAX_STRING_LENGTH):
                    max_val_prefix = page_max_value.rstrip('\0')
                    assert max_val_prefix[:-1] <= column_max_value
                else:
                    assert page_max_value <= column_max_value
Esempio n. 3
0
  def _decode_row_group_stats(self, schemas, row_group_stats):
    """Build the list of decoded per-column statistics of a single row group.

    The result has one entry per schema: None if the column has no statistics
    object or if both its 'min_value' and 'max_value' are None, otherwise a
    ColumnStats made of the column name and the decoded min and max values.
    'schemas' and 'row_group_stats' must have the same length.
    """
    assert len(schemas) == len(row_group_stats)

    result = []
    for col_schema, col_stats in zip(schemas, row_group_stats):
      entry = None
      if col_stats is not None:
        if not (col_stats.min_value is None and col_stats.max_value is None):
          # Having only one of the two values set is not allowed.
          assert (col_stats.min_value is not None
                  and col_stats.max_value is not None)
          lower = decode_stats_value(col_schema, col_stats.min_value)
          upper = decode_stats_value(col_schema, col_stats.max_value)
          entry = ColumnStats(col_schema.name, lower, upper)
      result.append(entry)

    assert len(result) == len(schemas)
    return result
Esempio n. 4
0
  def _decode_row_group_stats(self, schemas, row_group_stats):
    """Decode the statistics of every column of a single row group.

    Returns a list with one entry per schema. An entry is None if the column has
    no statistics object or if both its 'min' and 'max' fields are None;
    otherwise it is a ColumnStats holding the column name and the decoded min
    and max values. 'schemas' and 'row_group_stats' must have the same length.
    """
    def decode_column(col_schema, col_stats):
      # Columns without statistics, or without both values, decode to None.
      if col_stats is None:
        return None
      if col_stats.min is None and col_stats.max is None:
        return None

      # Having only one of the two values set is not allowed.
      assert col_stats.min is not None and col_stats.max is not None
      lower = decode_stats_value(col_schema, col_stats.min)
      upper = decode_stats_value(col_schema, col_stats.max)
      return ColumnStats(col_schema.name, lower, upper)

    assert len(schemas) == len(row_group_stats)
    decoded = [
        decode_column(col_schema, col_stats)
        for col_schema, col_stats in zip(schemas, row_group_stats)
    ]
    assert len(decoded) == len(schemas)
    return decoded