Example no. 1
0
def _read_data_page(fo, schema_helper, page_header, column_metadata, dictionary):
    """Read and decode one Parquet data page.

    :param fo: file-like object positioned at the page payload.
    :param schema_helper: provides max repetition/definition levels per column path.
    :param page_header: Thrift PageHeader whose ``data_page_header`` describes the page.
    :param column_metadata: Thrift ColumnMetaData for this column chunk.
    :param dictionary: list of dictionary values; only consulted for
        PLAIN_DICTIONARY-encoded pages (may be None otherwise).
    :returns: tuple ``(repetition_levels, definition_levels, vals)`` where
        ``vals`` holds only the non-null values (nulls are not interleaved here).
    :raises ParquetFormatException: if the page uses an unsupported encoding.
    """
    daph = page_header.data_page_header
    raw_bytes = _read_page(fo, page_header, column_metadata)
    io_obj = cStringIO.StringIO(raw_bytes)
    vals = []
    column_path_name = ".".join(column_metadata.path_in_schema)
    logger.debug("  path_in_schema: %s", column_path_name)
    logger.debug("  definition_level_encoding: %s", _get_name(Encoding, daph.definition_level_encoding))
    logger.debug("  repetition_level_encoding: %s", _get_name(Encoding, daph.repetition_level_encoding))
    logger.debug("  encoding: %s", _get_name(Encoding, daph.encoding))

    max_repetition_level = schema_helper.max_repetition_level(column_path_name)
    logger.debug("  max_repetition_level: %s", max_repetition_level)
    repetition_levels = []
    if max_repetition_level > 0:
        bit_width = encoding.width_from_max_int(max_repetition_level)
        if bit_width == 0:
            # A zero bit width means every level is implicitly 0.
            repetition_levels = [0] * daph.num_values
        else:
            repetition_levels = _read_data(io_obj, daph.repetition_level_encoding, daph.num_values, bit_width)
        logger.debug("  Repetition levels: %s ...", repetition_levels[0:10])

    definition_levels = []
    max_definition_level = schema_helper.max_definition_level(column_path_name)
    logger.debug("  max_definition_level: %s", max_definition_level)
    if max_definition_level > 0:
        bit_width = encoding.width_from_max_int(max_definition_level)
        logger.debug(
            "  max def level: %s   bit_width: %s  values: %s", max_definition_level, bit_width, daph.num_values
        )
        definition_levels = _read_data(io_obj, daph.definition_level_encoding, daph.num_values, bit_width)
        logger.debug("  Definition levels: %s ...", definition_levels[0:10])

    # Any definition level below the maximum marks a null at that position,
    # so the page stores only `num_not_null` actual values.
    num_nulls = sum(1 for dl in definition_levels if dl < max_definition_level)
    num_not_null = daph.num_values - num_nulls
    if daph.encoding == Encoding.PLAIN:
        for _ in range(num_not_null):
            vals.append(encoding.read_plain(io_obj, column_metadata.type, None))
    elif daph.encoding == Encoding.PLAIN_DICTIONARY:
        # bit_width is stored as a single byte before the encoded indexes.
        bit_width = struct.unpack("<B", io_obj.read(1))[0]
        logger.debug("bit_width: %d", bit_width)
        total_seen = 0
        dict_values_bytes = io_obj.read()
        dict_values_io_obj = cStringIO.StringIO(dict_values_bytes)
        while total_seen < num_not_null:
            values = encoding.read_rle_bit_packed_hybrid(dict_values_io_obj, bit_width, len(dict_values_bytes))
            # The hybrid reader may over-read padding past the logical value
            # count; clamp to num_not_null (NOT num_values — when the page
            # contains nulls, only num_not_null values are actually stored).
            if len(values) + total_seen > num_not_null:
                values = values[0 : num_not_null - total_seen]
            vals += [dictionary[v] for v in values]
            total_seen += len(values)
    elif daph.encoding == Encoding.DELTA_BYTE_ARRAY:
        vals = encoding.read_delta_byte_array(io_obj)
    elif daph.encoding == Encoding.DELTA_BINARY_PACKED:
        vals = encoding.read_delta_binary_packed(io_obj)
    else:
        # Format the message eagerly — passing the argument as a second
        # constructor positional would leave the %s placeholder unfilled.
        raise ParquetFormatException("Unsupported encoding: %s" % _get_name(Encoding, daph.encoding))
    return (repetition_levels, definition_levels, vals)
Example no. 2
0
def read_data_page(fo, schema_helper, page_header, column_metadata, dictionary):
    """Reads the datapage from the given file-like object based upon the
    metadata in the schema_helper, page_header, column_metadata, and
    (optional) dictionary. Returns a list of values.

    Nulls are interleaved back into the result: positions whose definition
    level is below the column's maximum become ``None``.

    :raises ParquetFormatException: if the page uses an unsupported encoding.
    """
    daph = page_header.data_page_header
    raw_bytes = _read_page(fo, page_header, column_metadata)
    io_obj = cStringIO.StringIO(raw_bytes)
    vals = []

    logger.debug("  path_in_schema: %s", ".".join(column_metadata.path_in_schema))
    logger.debug("  definition_level_encoding: %s", _get_name(Encoding, daph.definition_level_encoding))
    logger.debug("  repetition_level_encoding: %s", _get_name(Encoding, daph.repetition_level_encoding))
    logger.debug("  encoding: %s", _get_name(Encoding, daph.encoding))

    max_repetition_level = schema_helper.max_repetition_level(column_metadata.path_in_schema)
    logger.debug("  max_repetition_level: %s", max_repetition_level)
    if max_repetition_level > 0:
        bit_width = encoding.width_from_max_int(max_repetition_level)
        if bit_width == 0:
            # A zero bit width means every level is implicitly 0.
            repetition_levels = [0] * daph.num_values
        else:
            repetition_levels = _read_data(io_obj, daph.repetition_level_encoding, daph.num_values, bit_width)
        logger.debug("  Repetition levels: %s", repetition_levels)

    definition_levels = []
    max_definition_level = schema_helper.max_definition_level(column_metadata.path_in_schema)
    logger.debug("  max_definition_level: %s", max_definition_level)
    if max_definition_level > 0:
        bit_width = encoding.width_from_max_int(max_definition_level)
        logger.debug(
            "  max def level: %s   bit_width: %s  values: %s", max_definition_level, bit_width, daph.num_values
        )
        definition_levels = _read_data(io_obj, daph.definition_level_encoding, daph.num_values, bit_width)
        logger.debug("  Definition levels: %s", definition_levels)

    # Any definition level below the maximum marks a null at that position,
    # so the page stores only `num_not_null` actual values.
    num_nulls = sum(1 for dl in definition_levels if dl < max_definition_level)
    num_not_null = daph.num_values - num_nulls
    if daph.encoding == Encoding.PLAIN:
        for _ in range(num_not_null):
            vals.append(encoding.read_plain(io_obj, column_metadata.type, None))
    elif daph.encoding == Encoding.PLAIN_DICTIONARY:
        # bit_width is stored as a single byte before the encoded indexes.
        bit_width = struct.unpack("<B", io_obj.read(1))[0]
        logger.debug("bit_width: %d", bit_width)
        total_seen = 0
        dict_values_bytes = io_obj.read()
        dict_values_io_obj = cStringIO.StringIO(dict_values_bytes)
        # TODO jcrobak -- not sure that this loop is needed?
        while total_seen < num_not_null:
            values = encoding.read_rle_bit_packed_hybrid(dict_values_io_obj, bit_width, len(dict_values_bytes))
            # The hybrid reader may over-read padding past the logical value
            # count; clamp to num_not_null (NOT num_values — when the page
            # contains nulls, only num_not_null values are actually stored).
            if len(values) + total_seen > num_not_null:
                values = values[0 : num_not_null - total_seen]
            vals += [dictionary[v] for v in values]
            total_seen += len(values)
    elif daph.encoding == Encoding.DELTA_BYTE_ARRAY:
        vals = encoding.read_delta_byte_array(io_obj)
    elif daph.encoding == Encoding.DELTA_BINARY_PACKED:
        vals = encoding.read_delta_binary_packed(io_obj)
    else:
        # Format the message eagerly — passing the argument as a second
        # constructor positional would leave the %s placeholder unfilled.
        raise ParquetFormatException("Unsupported encoding: %s" % _get_name(Encoding, daph.encoding))
    if len(definition_levels) > 0:
        # Re-interleave nulls: walk the definition levels and consume the
        # decoded values in order, inserting None where a null was recorded.
        null_mixed = []
        idx2 = 0
        for i in range(daph.num_values):
            dl = definition_levels[i]
            if dl < max_definition_level:
                null_mixed.append(None)
            else:
                null_mixed.append(vals[idx2])
                idx2 += 1
        return null_mixed
    else:
        return vals