Beispiel #1
0
def read_dictionary_page(fo, page_header, column_metadata):
    """Decode every entry of a dictionary page into a list.

    The page payload is fetched with ``_read_page`` and then consumed
    front to back, one ``encoding.read_plain`` call per entry, until the
    read position reaches the end of the payload.

    Returns the decoded entries in the order they were stored.
    """
    page_bytes = _read_page(fo, page_header, column_metadata)
    end_offset = len(page_bytes)
    stream = cStringIO.StringIO(page_bytes)
    entries = []
    while True:
        if stream.tell() >= end_offset:
            break
        # TODO - length for fixed byte array (None is passed as the length
        # argument for now).
        entry = encoding.read_plain(stream, column_metadata.type, None)
        entries.append(entry)
    return entries
Beispiel #2
0
def read_dictionary_page(fo, page_header, column_metadata):
    """Return the list of values stored in a dictionary page.

    Fetches the page payload through ``_read_page``, wraps it in an
    in-memory stream and keeps calling ``encoding.read_plain`` until the
    stream position has reached the payload length.
    """
    payload = _read_page(fo, page_header, column_metadata)
    payload_size = len(payload)
    buf = cStringIO.StringIO(payload)

    def _decode_entries():
        # Lazily yield one decoded entry per read_plain call.
        while buf.tell() < payload_size:
            # TODO - length for fixed byte array
            yield encoding.read_plain(buf, column_metadata.type, None)

    return list(_decode_entries())
Beispiel #3
0
def read_data_page(fo, schema_helper, page_header, column_metadata, dictionary):
    """Reads the datapage from the given file-like object based upon the
    metadata in the schema_helper, page_header, column_metadata, and
    (optional) dictionary. Returns a list of values.

    The page payload is parsed in the order the Parquet format stores it:
    repetition levels, then definition levels, then the encoded values.

    Args:
        fo: file-like object handed to ``_read_page``.
        schema_helper: queried for ``is_required`` and the maximum
            repetition/definition levels of the column.
        page_header: its ``data_page_header`` supplies ``num_values`` and
            the three encodings.
        column_metadata: supplies ``path_in_schema`` and ``type``.
        dictionary: indexable collection used to resolve the indices of a
            PLAIN_DICTIONARY page; not touched for PLAIN pages.

    Raises:
        ParquetFormatException: if the value encoding is neither PLAIN nor
            PLAIN_DICTIONARY.
    """
    daph = page_header.data_page_header
    raw_bytes = _read_page(fo, page_header, column_metadata)
    io_obj = cStringIO.StringIO(raw_bytes)
    path = column_metadata.path_in_schema
    num_values = daph.num_values
    vals = []

    logger.debug("  definition_level_encoding: %s", _get_name(Encoding, daph.definition_level_encoding))
    logger.debug("  repetition_level_encoding: %s", _get_name(Encoding, daph.repetition_level_encoding))
    logger.debug("  encoding: %s", _get_name(Encoding, daph.encoding))

    # repetition levels are skipped if data is at the first level. They are
    # stored ahead of the definition levels, so they must be consumed first.
    if len(path) > 1:
        max_repetition_level = schema_helper.max_repetition_level(path)
        bit_width = encoding.width_from_max_int(max_repetition_level)
        if bit_width == 0:
            # Same handling as for definition levels below: a zero bit
            # width means there is nothing to read from the stream.
            repetition_levels = [0] * num_values
        else:
            repetition_levels = _read_data(io_obj, daph.repetition_level_encoding, num_values, bit_width)

    # definition levels are skipped if data is required.
    if not schema_helper.is_required(path[-1]):
        max_definition_level = schema_helper.max_definition_level(path)
        bit_width = encoding.width_from_max_int(max_definition_level)
        logger.debug("  max def level: %s   bit_width: %s", max_definition_level, bit_width)
        if bit_width == 0:
            definition_levels = [0] * num_values
        else:
            definition_levels = _read_data(io_obj, daph.definition_level_encoding, num_values, bit_width)

        logger.debug("  Definition levels: %s", len(definition_levels))

    # TODO Actually use the definition and repetition levels. For now they
    # are only read so that io_obj is positioned at the encoded values.

    if daph.encoding == Encoding.PLAIN:
        vals = [encoding.read_plain(io_obj, column_metadata.type, None) for _ in range(num_values)]
        logger.debug("  Values: %s", len(vals))
    elif daph.encoding == Encoding.PLAIN_DICTIONARY:
        # bit_width is stored as single byte.
        bit_width = struct.unpack("<B", io_obj.read(1))[0]
        logger.debug("bit_width: %d", bit_width)
        total_seen = 0
        dict_values_bytes = io_obj.read()
        dict_values_io_obj = cStringIO.StringIO(dict_values_bytes)
        # TODO jcrobak -- not sure that this loop is needed?
        while total_seen < num_values:
            values = encoding.read_rle_bit_packed_hybrid(dict_values_io_obj, bit_width, len(dict_values_bytes))
            # Drop any surplus indices beyond the declared value count.
            remaining = num_values - total_seen
            if len(values) > remaining:
                values = values[:remaining]
            vals += [dictionary[v] for v in values]
            total_seen += len(values)
    else:
        # Interpolate the name here; passing it as a second positional
        # argument would leave the "%s" placeholder unformatted.
        raise ParquetFormatException("Unsupported encoding: %s" % _get_name(Encoding, daph.encoding))
    return vals
Beispiel #4
0
def read_data_page(fo, schema_helper, page_header, column_metadata,
                   dictionary):
    """Reads the datapage from the given file-like object based upon the
    metadata in the schema_helper, page_header, column_metadata, and
    (optional) dictionary. Returns a list of values.

    The payload returned by ``_read_page`` is consumed in the order the
    Parquet format lays it out: repetition levels, definition levels,
    encoded values. Debug messages are only built when the logger is
    enabled for DEBUG.

    Raises ParquetFormatException when the value encoding is neither
    PLAIN nor PLAIN_DICTIONARY.
    """
    daph = page_header.data_page_header
    raw_bytes = _read_page(fo, page_header, column_metadata)
    io_obj = cStringIO.StringIO(raw_bytes)
    vals = []
    debug_logging = logger.isEnabledFor(logging.DEBUG)

    if debug_logging:
        logger.debug("  definition_level_encoding: %s",
                     _get_name(Encoding, daph.definition_level_encoding))
        logger.debug("  repetition_level_encoding: %s",
                     _get_name(Encoding, daph.repetition_level_encoding))
        logger.debug("  encoding: %s", _get_name(Encoding, daph.encoding))

    # repetition levels are skipped if data is at the first level. They
    # precede the definition levels in the page, so read them first.
    if len(column_metadata.path_in_schema) > 1:
        max_repetition_level = schema_helper.max_repetition_level(
            column_metadata.path_in_schema)
        bit_width = encoding.width_from_max_int(max_repetition_level)
        if bit_width == 0:
            # Mirrors the definition-level branch: with a zero bit width
            # nothing is read from the stream.
            repetition_levels = [0] * daph.num_values
        else:
            repetition_levels = _read_data(io_obj,
                                           daph.repetition_level_encoding,
                                           daph.num_values, bit_width)

    # definition levels are skipped if data is required.
    if not schema_helper.is_required(column_metadata.path_in_schema[-1]):
        max_definition_level = schema_helper.max_definition_level(
            column_metadata.path_in_schema)
        bit_width = encoding.width_from_max_int(max_definition_level)
        if debug_logging:
            logger.debug("  max def level: %s   bit_width: %s",
                         max_definition_level, bit_width)
        if bit_width == 0:
            definition_levels = [0] * daph.num_values
        else:
            definition_levels = _read_data(io_obj,
                                           daph.definition_level_encoding,
                                           daph.num_values, bit_width)

        if debug_logging:
            logger.debug("  Definition levels: %s", len(definition_levels))

    # TODO Actually use the definition and repetition levels. Reading them
    # currently only serves to advance io_obj to the encoded values.

    if daph.encoding == Encoding.PLAIN:
        vals.extend(
            encoding.read_plain(io_obj, column_metadata.type, None)
            for _ in range(daph.num_values))
        if debug_logging:
            logger.debug("  Values: %s", len(vals))
    elif daph.encoding == Encoding.PLAIN_DICTIONARY:
        # bit_width is stored as single byte.
        bit_width = struct.unpack("<B", io_obj.read(1))[0]
        if debug_logging:
            logger.debug("bit_width: %d", bit_width)
        total_seen = 0
        dict_values_bytes = io_obj.read()
        dict_values_io_obj = cStringIO.StringIO(dict_values_bytes)
        # TODO jcrobak -- not sure that this loop is needed?
        while total_seen < daph.num_values:
            values = encoding.read_rle_bit_packed_hybrid(
                dict_values_io_obj, bit_width, len(dict_values_bytes))
            # Never keep more indices than the page declares.
            if len(values) + total_seen > daph.num_values:
                values = values[0:daph.num_values - total_seen]
            vals += [dictionary[v] for v in values]
            total_seen += len(values)
    else:
        # Format the message up front; handing the name over as a second
        # argument would leave the "%s" placeholder in the message.
        raise ParquetFormatException(
            "Unsupported encoding: %s" % _get_name(Encoding, daph.encoding))
    return vals
Beispiel #5
0
def _read_data_page(fo, schema_helper, page_header, column_metadata, dictionary):
    """Read one data page and return its levels and decoded values.

    The payload from ``_read_page`` is parsed as repetition levels,
    definition levels and then the encoded values. Entries whose
    definition level is below the column's maximum are counted as nulls;
    for PLAIN and PLAIN_DICTIONARY pages only the remaining (non-null)
    values are decoded.

    Args:
        fo: file-like object handed to ``_read_page``.
        schema_helper: queried for the column's maximum repetition and
            definition levels, using the dot-joined ``path_in_schema``.
        page_header: its ``data_page_header`` supplies ``num_values`` and
            the three encodings.
        column_metadata: supplies ``path_in_schema`` and ``type``.
        dictionary: indexable collection used to resolve PLAIN_DICTIONARY
            indices.

    Returns:
        A ``(repetition_levels, definition_levels, vals)`` tuple. Either
        level list is empty when the corresponding maximum level is 0.

    Raises:
        ParquetFormatException: if the value encoding is not supported.
    """
    daph = page_header.data_page_header
    raw_bytes = _read_page(fo, page_header, column_metadata)
    io_obj = cStringIO.StringIO(raw_bytes)
    vals = []
    column_path_name = ".".join(column_metadata.path_in_schema)
    logger.debug("  path_in_schema: %s", column_path_name)
    logger.debug("  definition_level_encoding: %s", _get_name(Encoding, daph.definition_level_encoding))
    logger.debug("  repetition_level_encoding: %s", _get_name(Encoding, daph.repetition_level_encoding))
    logger.debug("  encoding: %s", _get_name(Encoding, daph.encoding))

    max_repetition_level = schema_helper.max_repetition_level(column_path_name)
    logger.debug("  max_repetition_level: %s", max_repetition_level)
    repetition_levels = []
    if max_repetition_level > 0:
        bit_width = encoding.width_from_max_int(max_repetition_level)
        if bit_width == 0:
            repetition_levels = [0] * daph.num_values
        else:
            repetition_levels = _read_data(io_obj, daph.repetition_level_encoding, daph.num_values, bit_width)
        logger.debug("  Repetition levels: %s ...", repetition_levels[0:10])

    definition_levels = []
    max_definition_level = schema_helper.max_definition_level(column_path_name)
    logger.debug("  max_definition_level: %s", max_definition_level)
    if max_definition_level > 0:
        bit_width = encoding.width_from_max_int(max_definition_level)
        logger.debug(
            "  max def level: %s   bit_width: %s  values: %s", max_definition_level, bit_width, daph.num_values
        )
        definition_levels = _read_data(io_obj, daph.definition_level_encoding, daph.num_values, bit_width)
        logger.debug("  Definition levels: %s ...", definition_levels[0:10])

    # An entry is null when its definition level is below the maximum.
    # Without definition levels nothing is counted as null.
    num_nulls = 0
    if definition_levels:
        for i in range(daph.num_values):
            if definition_levels[i] < max_definition_level:
                num_nulls += 1
    num_not_null = daph.num_values - num_nulls

    if daph.encoding == Encoding.PLAIN:
        vals = [encoding.read_plain(io_obj, column_metadata.type, None) for _ in range(num_not_null)]
    elif daph.encoding == Encoding.PLAIN_DICTIONARY:
        # bit_width is stored as single byte.
        bit_width = struct.unpack("<B", io_obj.read(1))[0]
        logger.debug("bit_width: %d", bit_width)
        total_seen = 0
        dict_values_bytes = io_obj.read()
        dict_values_io_obj = cStringIO.StringIO(dict_values_bytes)
        while total_seen < num_not_null:
            values = encoding.read_rle_bit_packed_hybrid(dict_values_io_obj, bit_width, len(dict_values_bytes))
            # Truncate against the number of non-null values (the loop
            # bound), not num_values: otherwise surplus indices from the
            # decoder could end up in vals when the page contains nulls.
            remaining = num_not_null - total_seen
            if len(values) > remaining:
                values = values[:remaining]
            vals += [dictionary[v] for v in values]
            total_seen += len(values)
    elif daph.encoding == Encoding.DELTA_BYTE_ARRAY:
        vals = encoding.read_delta_byte_array(io_obj)
    elif daph.encoding == Encoding.DELTA_BINARY_PACKED:
        vals = encoding.read_delta_binary_packed(io_obj)
    else:
        # Interpolate the name here; passing it as a second positional
        # argument would leave the "%s" placeholder unformatted.
        raise ParquetFormatException("Unsupported encoding: %s" % _get_name(Encoding, daph.encoding))
    return (repetition_levels, definition_levels, vals)
Beispiel #6
0
def read_data_page(fo, schema_helper, page_header, column_metadata, dictionary):
    """Reads the datapage from the given file-like object based upon the
    metadata in the schema_helper, page_header, column_metadata, and
    (optional) dictionary. Returns a list of values.

    The payload from ``_read_page`` is parsed as repetition levels,
    definition levels and then the encoded values. When definition levels
    are present, the returned list has ``num_values`` entries and holds
    ``None`` wherever the definition level is below the column's maximum;
    otherwise the decoded values are returned as they are.

    Raises:
        ParquetFormatException: if the value encoding is not supported.
    """
    daph = page_header.data_page_header
    raw_bytes = _read_page(fo, page_header, column_metadata)
    io_obj = cStringIO.StringIO(raw_bytes)
    vals = []

    logger.debug("  path_in_schema: %s", ".".join(column_metadata.path_in_schema))
    logger.debug("  definition_level_encoding: %s", _get_name(Encoding, daph.definition_level_encoding))
    logger.debug("  repetition_level_encoding: %s", _get_name(Encoding, daph.repetition_level_encoding))
    logger.debug("  encoding: %s", _get_name(Encoding, daph.encoding))

    max_repetition_level = schema_helper.max_repetition_level(column_metadata.path_in_schema)
    logger.debug("  max_repetition_level: %s", max_repetition_level)
    if max_repetition_level > 0:
        # The repetition levels are not used further here, but they must
        # be consumed so io_obj is positioned at the definition levels.
        bit_width = encoding.width_from_max_int(max_repetition_level)
        if bit_width == 0:
            repetition_levels = [0] * daph.num_values
        else:
            repetition_levels = _read_data(io_obj, daph.repetition_level_encoding, daph.num_values, bit_width)
        logger.debug("  Repetition levels: %s", repetition_levels)

    definition_levels = []
    max_definition_level = schema_helper.max_definition_level(column_metadata.path_in_schema)
    logger.debug("  max_definition_level: %s", max_definition_level)
    if max_definition_level > 0:
        bit_width = encoding.width_from_max_int(max_definition_level)
        logger.debug(
            "  max def level: %s   bit_width: %s  values: %s", max_definition_level, bit_width, daph.num_values
        )
        definition_levels = _read_data(io_obj, daph.definition_level_encoding, daph.num_values, bit_width)
        logger.debug("  Definition levels: %s", definition_levels)

    # An entry is null when its definition level is below the maximum.
    # Without definition levels nothing is counted as null.
    num_nulls = 0
    if definition_levels:
        for i in range(daph.num_values):
            if definition_levels[i] < max_definition_level:
                num_nulls += 1
    num_not_null = daph.num_values - num_nulls

    if daph.encoding == Encoding.PLAIN:
        vals = [encoding.read_plain(io_obj, column_metadata.type, None) for _ in range(num_not_null)]
    elif daph.encoding == Encoding.PLAIN_DICTIONARY:
        # bit_width is stored as single byte.
        bit_width = struct.unpack("<B", io_obj.read(1))[0]
        logger.debug("bit_width: %d", bit_width)
        total_seen = 0
        dict_values_bytes = io_obj.read()
        dict_values_io_obj = cStringIO.StringIO(dict_values_bytes)
        # TODO jcrobak -- not sure that this loop is needed?
        while total_seen < num_not_null:
            values = encoding.read_rle_bit_packed_hybrid(dict_values_io_obj, bit_width, len(dict_values_bytes))
            # Truncate against the number of non-null values (the loop
            # bound), not num_values, so surplus indices from the decoder
            # are never looked up in the dictionary.
            remaining = num_not_null - total_seen
            if len(values) > remaining:
                values = values[:remaining]
            vals += [dictionary[v] for v in values]
            total_seen += len(values)
    elif daph.encoding == Encoding.DELTA_BYTE_ARRAY:
        vals = encoding.read_delta_byte_array(io_obj)
    elif daph.encoding == Encoding.DELTA_BINARY_PACKED:
        vals = encoding.read_delta_binary_packed(io_obj)
    else:
        # Interpolate the name here; passing it as a second positional
        # argument would leave the "%s" placeholder unformatted.
        raise ParquetFormatException("Unsupported encoding: %s" % _get_name(Encoding, daph.encoding))

    if not definition_levels:
        return vals

    # Re-insert the nulls: walk the definition levels and take the next
    # decoded value only for entries that are fully defined.
    null_mixed = []
    next_val = 0
    for i in range(daph.num_values):
        if definition_levels[i] < max_definition_level:
            null_mixed.append(None)
        else:
            null_mixed.append(vals[next_val])
            next_val += 1
    return null_mixed