def read_index(sections, idx, codec):
    """Read the index whose header record is ``sections[idx]``.

    The header record is parsed first.  The ``count`` index records that
    follow it (``sections[idx + 1]`` onwards) are then parsed one by one, and
    any CNCX records are taken from the sections immediately after those
    index records.

    :param sections: sequence of records, each exposing its bytes as ``.raw``
    :param idx: position in ``sections`` of the index header record
    :param codec: codec handed to ``CNCX`` and ``parse_index_record``
    :return: tuple ``(table, cncx, indx_header, index_headers)`` where
        ``table`` is the ``OrderedDict`` passed to every
        ``parse_index_record`` call, ``cncx`` is the ``CNCX`` object (built
        from no records when the header reports none), ``indx_header`` is the
        parsed header of the main record and ``index_headers`` holds one
        parsed header per index record, in order.
    """
    table = OrderedDict()
    cncx = CNCX([], codec)

    header_raw = sections[idx].raw
    indx_header = parse_indx_header(header_raw)
    indx_count = indx_header['count']
    first_record = idx + 1

    if indx_header['ncncx'] > 0:
        # The CNCX records sit directly after the last index record.
        cncx_start = first_record + indx_count
        cncx_end = cncx_start + indx_header['ncncx']
        cncx = CNCX([rec.raw for rec in sections[cncx_start:cncx_end]], codec)

    control_byte_count, tags = parse_tagx_section(
        header_raw[indx_header['tagx']:])

    # Called before the loop: the loop reads indx_header['ordt_map'],
    # presumably filled in by this call -- confirm against the helper.
    read_variable_len_data(header_raw, indx_header)

    index_headers = []
    for record_num in range(first_record, first_record + indx_count):
        # Index record
        record_raw = sections[record_num].raw
        record_header = parse_index_record(
            table, record_raw, control_byte_count, tags, codec,
            indx_header['ordt_map'], strict=True)
        index_headers.append(record_header)
        read_variable_len_data(record_raw, record_header)

    return table, cncx, indx_header, index_headers
def __init__(self, record):
    """Parse and validate the primary index (INDX) record.

    Every fixed-offset header field of ``record.raw`` is stored as an
    attribute of the same name used below; byte ranges whose meaning is not
    decoded are kept verbatim (``unknown1``, ``unknown2``, ``unknown3``,
    ``possibly_language``).  The TAGX section that follows the header and the
    IDXT section at ``idxt_start`` are checked for consistency.

    :param record: object exposing the record bytes as ``.raw``
    :raises ValueError: if the INDX, TAGX or IDXT signature is missing, the
        index encoding is not recognised, the TAGX offset differs from the
        header length, the last TAGX entry is not the EOF entry, the last
        NCX id does not equal the NCX count minus one, or the IDXT length
        check fails.
    """
    self.record = record
    raw = self.record.raw
    # Debugging aid: open('/t/index_header.bin', 'wb').write(raw)

    def uint32(buf, offset):
        # Big-endian unsigned 32-bit integer stored at buf[offset:offset+4].
        return struct.unpack('>I', buf[offset:offset + 4])[0]

    if raw[:4] != b'INDX':
        raise ValueError('Invalid Primary Index Record')

    # Fixed-layout part of the header.
    self.header_length = uint32(raw, 4)
    self.unknown1 = raw[8:12]
    self.header_type = uint32(raw, 12)
    self.index_type = uint32(raw, 16)
    index_type_names = {0: 'normal', 2: 'inflection', 6: 'calibre'}
    self.index_type_desc = index_type_names.get(self.index_type, 'unknown')
    self.idxt_start = uint32(raw, 20)
    self.index_count = uint32(raw, 24)
    self.index_encoding_num = uint32(raw, 28)
    known_encodings = {65001: 'utf-8', 1252: 'cp1252'}
    self.index_encoding = known_encodings.get(
        self.index_encoding_num, 'unknown')
    if self.index_encoding == 'unknown':
        raise ValueError(
            'Unknown index encoding: %d'%self.index_encoding_num)
    self.possibly_language = raw[32:36]
    self.num_index_entries = uint32(raw, 36)
    self.ordt_start = uint32(raw, 40)
    self.ligt_start = uint32(raw, 44)
    self.num_of_ligt_entries = uint32(raw, 48)
    self.num_of_cncx_blocks = uint32(raw, 52)
    self.unknown2 = raw[56:180]
    self.tagx_offset = uint32(raw, 180)
    if self.tagx_offset != self.header_length:
        raise ValueError('TAGX offset and header length disagree')
    self.unknown3 = raw[184:self.header_length]

    # TAGX section, located directly after the header.
    tagx = raw[self.header_length:]
    if not tagx.startswith(b'TAGX'):
        raise ValueError('Invalid TAGX section')
    self.tagx_header_length = uint32(tagx, 4)
    self.tagx_control_byte_count = uint32(tagx, 8)
    tag_specs = parse_tagx_section(tagx)[1]
    self.tagx_entries = [TagX(*spec) for spec in tag_specs]
    if self.tagx_entries and not self.tagx_entries[-1].is_eof:
        raise ValueError('TAGX last entry is not EOF')

    # Data after the TAGX section: the id of the last entry (decoded by
    # decode_hex_number) followed by a 2-byte big-endian NCX count.
    entries_pos = self.header_length + self.tagx_header_length
    last_id, id_length = decode_hex_number(raw[entries_pos:])
    count_pos = entries_pos + id_length
    self.ncx_count = struct.unpack(b'>H', raw[count_pos:count_pos + 2])[0]
    self.last_entry = last_id
    if last_id != self.ncx_count - 1:
        raise ValueError('Last id number in the NCX != NCX count - 1')

    # Alignment zero bytes may separate the end of that data from the IDXT
    # section, so the section is located via idxt_start rather than by
    # scanning forward.
    idxt = raw[self.idxt_start:]
    if idxt[:4] != b'IDXT':
        raise ValueError('Invalid IDXT header')
    length_check = struct.unpack(b'>H', idxt[4:6])[0]
    if length_check != self.header_length + self.tagx_header_length:
        raise ValueError('Length check failed')