def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, data, inflectionNames, groupList):
    '''
    Create string which contains the inflection groups with inflection rules as mobipocket tags.

    For every group in groupList one "<idx:infl>" element is created. It
    contains one "<idx:iform>" element for each inflection rule of the group
    for which applyInflectionRule() returns a result.

    @param mainEntry: The word to inflect.
    @param controlByteCount: The number of control bytes.
    @param tagTable: The tag table.
    @param data: The inflection index data.
    @param inflectionNames: The inflection rule name data.
    @param groupList: The list of inflection groups to process.
    @return: String with inflection groups and rules or empty string if required tags are not available
             or tag 0x1a holds fewer values than tag 0x05.
    @raise AssertionError: If a group entry does not start with a 0x00 byte.
    '''
    # Big-endian 32 bit values at 0x14 and 0x18: position of the offset table
    # and number of entries. The 16 bit entry offsets start 4 bytes after idxtPos.
    idxtPos, = struct.unpack_from('>L', data, 0x14)
    entryCount, = struct.unpack_from('>L', data, 0x18)

    # Collect the fragments and join them once instead of growing a string.
    parts = []
    for group in groupList:
        offset, = struct.unpack_from('>H', data, idxtPos + 4 + (2 * group))
        if group + 1 < entryCount:
            nextOffset, = struct.unpack_from('>H', data, idxtPos + 4 + (2 * (group + 1)))
        else:
            # The last entry has no successor, so its end position is unknown.
            nextOffset = None

        # First byte seems to be always 0x00 and must be skipped.
        # Checked explicitly (not with an assert statement) so that the check is
        # not removed when running with -O; the exception type stays the same.
        if ord(data[offset]) != 0x00:
            raise AssertionError("Inflection group entry at offset %d does not start with 0x00" % offset)

        tagMap = self.getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset)

        # Make sure that the required tags are available.
        if 0x05 not in tagMap:
            print("Error: Required tag 0x05 not found in tagMap")
            return ""
        if 0x1a not in tagMap:
            print("Error: Required tag 0x1a not found in tagMap")
            return ""

        nameOffsets = tagMap[0x05]
        ruleIndexes = tagMap[0x1a]
        # Every rule name needs a matching rule. Without this check a short
        # rule list would abort with an IndexError in the loop below.
        if len(ruleIndexes) < len(nameOffsets):
            print("Error: Tag 0x1a has fewer values than tag 0x05 in tagMap")
            return ""

        parts.append("<idx:infl>")
        for nameOffset, ruleIndex in zip(nameOffsets, ruleIndexes):
            # Get name of inflection rule: a variable width length followed by the text.
            consumed, textLength = getVariableWidthValue(inflectionNames, nameOffset)
            nameStart = nameOffset + consumed
            inflectionName = inflectionNames[nameStart:nameStart + textLength]

            # Get and apply inflection rule: one length byte followed by the rule bytes.
            ruleOffset, = struct.unpack_from('>H', data, idxtPos + 4 + (2 * ruleIndex))
            ruleLength = ord(data[ruleOffset])
            inflection = self.applyInflectionRule(mainEntry, data, ruleOffset + 1, ruleOffset + 1 + ruleLength)
            if inflection is not None:
                parts.append(' <idx:iform name="%s" value="%s"/>' % (inflectionName, inflection))
        parts.append("</idx:infl>")

    return "".join(parts)
def readCTOC(self, txtdata):
    '''
    Read all blocks from the CTOC data.

    Every block consists of a variable width value holding the length of the
    name, followed by the name itself. Reading stops at the end of the data
    or at the first block which starts with a NUL character.

    @param txtdata: The CTOC data.
    @return: Hashmap of block start offset and name.
    '''
    names = {}
    total = len(txtdata)
    cursor = 0
    while cursor < total and txtdata[cursor] != '\0':
        blockStart = cursor
        # First n bytes: name length as variable width value.
        used, nameLength = getVariableWidthValue(txtdata, cursor)
        nameStart = cursor + used
        # Next <nameLength> bytes: the name.
        cursor = nameStart + nameLength
        name = txtdata[nameStart:cursor]
        if DEBUG:
            # Both items joined by a single space, like a two item print statement.
            print("%s %s" % ("name length is ", nameLength))
            print("%s %s" % (blockStart, name))
        names[blockStart] = name
    return names
def getTagMap(self, controlByteCount, tagTable, entryData, startPos, endPos):
    '''
    Create a map of tags and values from the given byte section.

    The control bytes at startPos are evaluated against the masks of the tag
    table first. Afterwards the variable width values of all tags found are
    read from the bytes following the control bytes.

    @param controlByteCount: The number of control bytes.
    @param tagTable: The tag table, a sequence of (tag, valuesPerEntry, mask, endFlag) entries.
    @param entryData: The data to process.
    @param startPos: The starting position in entryData.
    @param endPos: The end position in entryData or None if it is unknown.
    @return: Hashmap of tag and list of values.
    '''
    # Collected (tag, valueCount, valueBytes, valuesPerEntry) tuples; exactly
    # one of valueCount and valueBytes is set, the other one is None.
    pending = []
    controlPos = startPos
    cursor = startPos + controlByteCount

    for tag, valuesPerEntry, mask, endFlag in tagTable:
        if endFlag == 0x01:
            # End of the tags for this control byte, continue with the next one.
            controlPos += 1
            continue

        masked = ord(entryData[controlPos]) & mask
        if masked == 0:
            continue

        if masked != mask:
            # Shift bits to get the masked value.
            shifted = mask
            while shifted & 0x01 == 0:
                shifted >>= 1
                masked >>= 1
            pending.append((tag, masked, None, valuesPerEntry))
        elif self.countSetBits(mask) > 1:
            # If all bits of masked value are set and the mask has more than one bit, a variable width value
            # will follow after the control bytes which defines the length of bytes (NOT the value count!)
            # which will contain the corresponding variable width values.
            used, byteLength = getVariableWidthValue(entryData, cursor)
            cursor += used
            pending.append((tag, None, byteLength, valuesPerEntry))
        else:
            pending.append((tag, 1, None, valuesPerEntry))

    tagHashMap = {}
    for tag, valueCount, valueBytes, valuesPerEntry in pending:
        values = []
        if valueCount is None:
            # Convert valueBytes to variable width values.
            # Open question kept from the original code: does this work for valuesPerEntry != 1?
            bytesRead = 0
            while bytesRead < valueBytes:
                used, item = getVariableWidthValue(entryData, cursor)
                cursor += used
                bytesRead += used
                values.append(item)
            if bytesRead != valueBytes:
                print("Error: Should consume %s bytes, but consumed %s" % (valueBytes, bytesRead))
        else:
            # Read valueCount * valuesPerEntry variable width values.
            for _ in range(valueCount):
                for _ in range(valuesPerEntry):
                    used, item = getVariableWidthValue(entryData, cursor)
                    cursor += used
                    values.append(item)
        tagHashMap[tag] = values

    # Test that all bytes have been processed if endPos is given.
    if endPos is not None and cursor != endPos:
        # The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
        leftover = entryData[cursor:endPos]
        zeroByte = chr(0x00)
        if any(char != zeroByte for char in leftover):
            print("Warning: There are unprocessed index bytes left: %s" % toHex(leftover))
            if DEBUG_DICT:
                print("controlByteCount: %s" % controlByteCount)
                print("tagTable: %s" % tagTable)
                print("data: %s" % toHex(entryData[startPos:endPos]))
                print("tagHashMap: %s" % tagHashMap)

    return tagHashMap
def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, data, inflectionNames, groupList):
    '''
    Create string which contains the inflection groups with inflection rules as mobipocket tags.

    For every group in groupList one "<idx:infl>" element is created. It
    contains one "<idx:iform>" element for each inflection rule of the group
    for which applyInflectionRule() returns a result.

    @param mainEntry: The word to inflect.
    @param controlByteCount: The number of control bytes.
    @param tagTable: The tag table.
    @param data: The inflection index data.
    @param inflectionNames: The inflection rule name data.
    @param groupList: The list of inflection groups to process.
    @return: String with inflection groups and rules or empty string if required tags are not available
             or tag 0x1a holds fewer values than tag 0x05.
    @raise AssertionError: If a group entry does not start with a 0x00 byte.
    '''
    # Big-endian 32 bit values at 0x14 and 0x18: position of the offset table
    # and number of entries. The 16 bit entry offsets start 4 bytes after idxtPos.
    idxtPos, = struct.unpack_from('>L', data, 0x14)
    entryCount, = struct.unpack_from('>L', data, 0x18)

    # Collect the fragments and join them once instead of growing a string.
    parts = []
    for group in groupList:
        offset, = struct.unpack_from('>H', data, idxtPos + 4 + (2 * group))
        if group + 1 < entryCount:
            nextOffset, = struct.unpack_from('>H', data, idxtPos + 4 + (2 * (group + 1)))
        else:
            # The last entry has no successor, so its end position is unknown.
            nextOffset = None

        # First byte seems to be always 0x00 and must be skipped.
        # Checked explicitly (not with an assert statement) so that the check is
        # not removed when running with -O; the exception type stays the same.
        if ord(data[offset]) != 0x00:
            raise AssertionError("Inflection group entry at offset %d does not start with 0x00" % offset)

        tagMap = self.getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset)

        # Make sure that the required tags are available.
        if 0x05 not in tagMap:
            print("Error: Required tag 0x05 not found in tagMap")
            return ""
        if 0x1a not in tagMap:
            print("Error: Required tag 0x1a not found in tagMap")
            return ""

        nameOffsets = tagMap[0x05]
        ruleIndexes = tagMap[0x1a]
        # Every rule name needs a matching rule. Without this check a short
        # rule list would abort with an IndexError in the loop below.
        if len(ruleIndexes) < len(nameOffsets):
            print("Error: Tag 0x1a has fewer values than tag 0x05 in tagMap")
            return ""

        parts.append("<idx:infl>")
        for nameOffset, ruleIndex in zip(nameOffsets, ruleIndexes):
            # Get name of inflection rule: a variable width length followed by the text.
            consumed, textLength = getVariableWidthValue(inflectionNames, nameOffset)
            nameStart = nameOffset + consumed
            inflectionName = inflectionNames[nameStart:nameStart + textLength]

            # Get and apply inflection rule: one length byte followed by the rule bytes.
            ruleOffset, = struct.unpack_from('>H', data, idxtPos + 4 + (2 * ruleIndex))
            ruleLength = ord(data[ruleOffset])
            inflection = self.applyInflectionRule(mainEntry, data, ruleOffset + 1, ruleOffset + 1 + ruleLength)
            if inflection is not None:
                parts.append(' <idx:iform name="%s" value="%s"/>' % (inflectionName, inflection))
        parts.append("</idx:infl>")

    return "".join(parts)
def getTagMap(self, controlByteCount, tagTable, entryData, startPos, endPos):
    '''
    Create a map of tags and values from the given byte section.

    The control bytes at startPos are evaluated against the masks of the tag
    table first. Afterwards the variable width values of all tags found are
    read from the bytes following the control bytes.

    @param controlByteCount: The number of control bytes.
    @param tagTable: The tag table, a sequence of (tag, valuesPerEntry, mask, endFlag) entries.
    @param entryData: The data to process.
    @param startPos: The starting position in entryData.
    @param endPos: The end position in entryData or None if it is unknown.
    @return: Hashmap of tag and list of values.
    '''
    # Collected (tag, valueCount, valueBytes, valuesPerEntry) tuples; exactly
    # one of valueCount and valueBytes is set, the other one is None.
    pending = []
    controlPos = startPos
    cursor = startPos + controlByteCount

    for tag, valuesPerEntry, mask, endFlag in tagTable:
        if endFlag == 0x01:
            # End of the tags for this control byte, continue with the next one.
            controlPos += 1
            continue

        masked = ord(entryData[controlPos]) & mask
        if masked == 0:
            continue

        if masked != mask:
            # Shift bits to get the masked value.
            shifted = mask
            while shifted & 0x01 == 0:
                shifted >>= 1
                masked >>= 1
            pending.append((tag, masked, None, valuesPerEntry))
        elif self.countSetBits(mask) > 1:
            # If all bits of masked value are set and the mask has more than one bit, a variable width value
            # will follow after the control bytes which defines the length of bytes (NOT the value count!)
            # which will contain the corresponding variable width values.
            used, byteLength = getVariableWidthValue(entryData, cursor)
            cursor += used
            pending.append((tag, None, byteLength, valuesPerEntry))
        else:
            pending.append((tag, 1, None, valuesPerEntry))

    tagHashMap = {}
    for tag, valueCount, valueBytes, valuesPerEntry in pending:
        values = []
        if valueCount is None:
            # Convert valueBytes to variable width values.
            # Open question kept from the original code: does this work for valuesPerEntry != 1?
            bytesRead = 0
            while bytesRead < valueBytes:
                used, item = getVariableWidthValue(entryData, cursor)
                cursor += used
                bytesRead += used
                values.append(item)
            if bytesRead != valueBytes:
                print("Error: Should consume %s bytes, but consumed %s" % (valueBytes, bytesRead))
        else:
            # Read valueCount * valuesPerEntry variable width values.
            for _ in range(valueCount):
                for _ in range(valuesPerEntry):
                    used, item = getVariableWidthValue(entryData, cursor)
                    cursor += used
                    values.append(item)
        tagHashMap[tag] = values

    # Test that all bytes have been processed if endPos is given.
    if endPos is not None and cursor != endPos:
        # The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
        leftover = entryData[cursor:endPos]
        zeroByte = chr(0x00)
        if any(char != zeroByte for char in leftover):
            print("Warning: There are unprocessed index bytes left: %s" % toHex(leftover))
            if DEBUG_DICT:
                print("controlByteCount: %s" % controlByteCount)
                print("tagTable: %s" % tagTable)
                print("data: %s" % toHex(entryData[startPos:endPos]))
                print("tagHashMap: %s" % tagHashMap)

    return tagHashMap