def _read_data_page(fo, schema_helper, page_header, column_metadata, dictionary):
    daph = page_header.data_page_header
    raw_bytes = _read_page(fo, page_header, column_metadata)
    io_obj = cStringIO.StringIO(raw_bytes)
    vals = []

    column_path_name = ".".join(column_metadata.path_in_schema)
    logger.debug(" path_in_schema: %s", column_path_name)
    logger.debug(" definition_level_encoding: %s",
                 _get_name(Encoding, daph.definition_level_encoding))
    logger.debug(" repetition_level_encoding: %s",
                 _get_name(Encoding, daph.repetition_level_encoding))
    logger.debug(" encoding: %s", _get_name(Encoding, daph.encoding))

    # Repetition levels are only stored for nested columns.
    max_repetition_level = schema_helper.max_repetition_level(column_path_name)
    logger.debug(" max_repetition_level: %s", max_repetition_level)
    repetition_levels = []
    if max_repetition_level > 0:
        bit_width = encoding.width_from_max_int(max_repetition_level)
        if bit_width == 0:
            repetition_levels = [0] * daph.num_values
        else:
            repetition_levels = _read_data(io_obj, daph.repetition_level_encoding,
                                           daph.num_values, bit_width)
        logger.debug(" Repetition levels: %s ...", repetition_levels[0:10])

    # Definition levels are only stored for optional (nullable) columns.
    definition_levels = []
    max_definition_level = schema_helper.max_definition_level(column_path_name)
    logger.debug(" max_definition_level: %s", max_definition_level)
    if max_definition_level > 0:
        bit_width = encoding.width_from_max_int(max_definition_level)
        logger.debug(" max def level: %s bit_width: %s values: %s",
                     max_definition_level, bit_width, daph.num_values)
        definition_levels = _read_data(io_obj, daph.definition_level_encoding,
                                       daph.num_values, bit_width)
        logger.debug(" Definition levels: %s ...", definition_levels[0:10])

    # A slot is null when its definition level is below the maximum; only the
    # non-null values are actually encoded in the page.
    num_nulls = 0
    for dl in definition_levels:
        if dl < max_definition_level:
            num_nulls += 1
    num_not_null = daph.num_values - num_nulls

    if daph.encoding == Encoding.PLAIN:
        for i in range(num_not_null):
            vals.append(encoding.read_plain(io_obj, column_metadata.type, None))
    elif daph.encoding == Encoding.PLAIN_DICTIONARY:
        # bit_width is stored as a single byte.
        bit_width = struct.unpack("<B", io_obj.read(1))[0]
        logger.debug("bit_width: %d", bit_width)
        total_seen = 0
        dict_values_bytes = io_obj.read()
        dict_values_io_obj = cStringIO.StringIO(dict_values_bytes)
        while total_seen < num_not_null:
            values = encoding.read_rle_bit_packed_hybrid(
                dict_values_io_obj, bit_width, len(dict_values_bytes))
            if len(values) + total_seen > daph.num_values:
                values = values[0:daph.num_values - total_seen]
            # The dictionary page holds the actual values; the data page holds
            # indexes into it.
            vals += [dictionary[v] for v in values]
            total_seen += len(values)
    elif daph.encoding == Encoding.DELTA_BYTE_ARRAY:
        vals = encoding.read_delta_byte_array(io_obj)
    elif daph.encoding == Encoding.DELTA_BINARY_PACKED:
        vals = encoding.read_delta_binary_packed(io_obj)
    else:
        raise ParquetFormatException(
            "Unsupported encoding: %s" % _get_name(Encoding, daph.encoding))
    return (repetition_levels, definition_levels, vals)
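
# The null handling above hinges on one rule: a slot is null exactly when its
# definition level is below the column's maximum definition level. A minimal,
# self-contained sketch of that rule (count_nulls is a hypothetical helper for
# illustration, not part of this module):
def count_nulls(definition_levels, max_definition_level):
    """Count the null slots implied by a page's definition levels."""
    return sum(1 for dl in definition_levels if dl < max_definition_level)

# Example: with max_definition_level == 1, levels of 0 mark nulls, so
# count_nulls([1, 0, 1, 1, 0], 1) == 2.
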
def read_data_page(fo, schema_helper, page_header, column_metadata, dictionary):
    """Read the data page from the given file-like object, based upon the
    metadata in the schema_helper, page_header, column_metadata, and
    (optional) dictionary.

    Returns a list of values, with None in the null positions when the
    column has definition levels.
    """
    daph = page_header.data_page_header
    raw_bytes = _read_page(fo, page_header, column_metadata)
    io_obj = cStringIO.StringIO(raw_bytes)
    vals = []

    logger.debug(" path_in_schema: %s", ".".join(column_metadata.path_in_schema))
    logger.debug(" definition_level_encoding: %s",
                 _get_name(Encoding, daph.definition_level_encoding))
    logger.debug(" repetition_level_encoding: %s",
                 _get_name(Encoding, daph.repetition_level_encoding))
    logger.debug(" encoding: %s", _get_name(Encoding, daph.encoding))

    max_repetition_level = schema_helper.max_repetition_level(
        column_metadata.path_in_schema)
    logger.debug(" max_repetition_level: %s", max_repetition_level)
    if max_repetition_level > 0:
        bit_width = encoding.width_from_max_int(max_repetition_level)
        if bit_width == 0:
            repetition_levels = [0] * daph.num_values
        else:
            repetition_levels = _read_data(io_obj, daph.repetition_level_encoding,
                                           daph.num_values, bit_width)
        logger.debug(" Repetition levels: %s", repetition_levels)

    definition_levels = []
    max_definition_level = schema_helper.max_definition_level(
        column_metadata.path_in_schema)
    logger.debug(" max_definition_level: %s", max_definition_level)
    if max_definition_level > 0:
        bit_width = encoding.width_from_max_int(max_definition_level)
        logger.debug(" max def level: %s bit_width: %s values: %s",
                     max_definition_level, bit_width, daph.num_values)
        definition_levels = _read_data(io_obj, daph.definition_level_encoding,
                                       daph.num_values, bit_width)
        logger.debug(" Definition levels: %s", definition_levels)

    # A slot is null when its definition level is below the maximum; only the
    # non-null values are encoded in the page.
    num_nulls = 0
    for dl in definition_levels:
        if dl < max_definition_level:
            num_nulls += 1
    num_not_null = daph.num_values - num_nulls

    if daph.encoding == Encoding.PLAIN:
        for i in range(num_not_null):
            vals.append(encoding.read_plain(io_obj, column_metadata.type, None))
    elif daph.encoding == Encoding.PLAIN_DICTIONARY:
        # bit_width is stored as a single byte.
        bit_width = struct.unpack("<B", io_obj.read(1))[0]
        logger.debug("bit_width: %d", bit_width)
        total_seen = 0
        dict_values_bytes = io_obj.read()
        dict_values_io_obj = cStringIO.StringIO(dict_values_bytes)
        # TODO jcrobak -- not sure that this loop is needed?
        while total_seen < num_not_null:
            values = encoding.read_rle_bit_packed_hybrid(
                dict_values_io_obj, bit_width, len(dict_values_bytes))
            if len(values) + total_seen > daph.num_values:
                values = values[0:daph.num_values - total_seen]
            vals += [dictionary[v] for v in values]
            total_seen += len(values)
    elif daph.encoding == Encoding.DELTA_BYTE_ARRAY:
        vals = encoding.read_delta_byte_array(io_obj)
    elif daph.encoding == Encoding.DELTA_BINARY_PACKED:
        vals = encoding.read_delta_binary_packed(io_obj)
    else:
        raise ParquetFormatException(
            "Unsupported encoding: %s" % _get_name(Encoding, daph.encoding))

    if len(definition_levels) > 0:
        # Re-insert None for the null slots so the result lines up with the
        # definition levels.
        null_mixed = []
        idx2 = 0
        for dl in definition_levels:
            if dl < max_definition_level:
                null_mixed.append(None)
            else:
                null_mixed.append(vals[idx2])
                idx2 += 1
        return null_mixed
    return vals
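
# The trailing null-merging step in read_data_page can be viewed as a
# standalone transformation: walk the definition levels and interleave the
# decoded non-null values back into a full-length column. A hedged sketch,
# where merge_nulls is an illustrative name and not part of this module:
def merge_nulls(vals, definition_levels, max_definition_level):
    """Rebuild the full column: None where the definition level is below
    the max, otherwise the next decoded value in order."""
    it = iter(vals)
    return [None if dl < max_definition_level else next(it)
            for dl in definition_levels]

# Example: three decoded values spread across five slots, so
# merge_nulls(["a", "b", "c"], [1, 0, 1, 1, 0], 1)
# returns ["a", None, "b", "c", None].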