コード例 #1
0
ファイル: mobi_dict.py プロジェクト: mlitwin/hesperian-tools
    def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, data, inflectionNames, groupList):
        '''
        Create string which contains the inflection groups with inflection rules as mobipocket tags.

        For every group index in groupList the group's entry is located through the table of
        2 byte big endian offsets that starts 4 bytes after idxtPos (idxtPos itself is read from
        data at 0x14). The entry is decoded with getTagMap and every (rule name, rule) pair found
        under the tags 0x05 / 0x1a is rendered as one <idx:iform> element inside an <idx:infl> element.

        @param mainEntry: The word to inflect.
        @param controlByteCount: The number of control bytes.
        @param tagTable: The tag table.
        @param data: The inflection index data.
        @param inflectionNames: The inflection rule name data.
        @param groupList: The list of inflection groups to process.
        @return: String with inflection groups and rules or empty string if required tags are not available.
        '''
        parts = []
        idxtPos, = struct.unpack_from('>L', data, 0x14)
        entryCount, = struct.unpack_from('>L', data, 0x18)
        # Start of the offset table; entry N's offset is stored at offsetTable + 2 * N.
        offsetTable = idxtPos + 4

        for groupIndex in groupList:
            groupOffset, = struct.unpack_from('>H', data, offsetTable + 2 * groupIndex)
            if groupIndex + 1 < entryCount:
                # The offset of the following entry is passed to getTagMap as the end position.
                nextOffset, = struct.unpack_from('>H', data, offsetTable + 2 * (groupIndex + 1))
            else:
                # Last entry: the end position is unknown.
                nextOffset = None

            # First byte seems to be always 0x00 and must be skipped.
            # NOTE(review): assert is stripped under "python -O"; kept so callers still see AssertionError.
            assert ord(data[groupOffset]) == 0x00
            tagMap = self.getTagMap(controlByteCount, tagTable, data, groupOffset + 1, nextOffset)

            # Make sure that the required tags are available. Anything collected so far is
            # discarded: the documented result in this case is the empty string.
            if 0x05 not in tagMap:
                print("Error: Required tag 0x05 not found in tagMap")
                return ""
            if 0x1a not in tagMap:
                print("Error: Required tag 0x1a not found in tagMap")
                return ""

            parts.append("<idx:infl>")

            ruleIndexes = tagMap[0x1a]
            for i, nameOffset in enumerate(tagMap[0x05]):
                # Get name of inflection rule: a variable width length followed by the name text.
                consumed, textLength = getVariableWidthValue(inflectionNames, nameOffset)
                nameStart = nameOffset + consumed
                inflectionName = inflectionNames[nameStart:nameStart + textLength]

                # Get and apply inflection rule. The rule entry starts with a one byte length.
                # Indexing (rather than zip) keeps the IndexError if tag 0x1a has fewer values than 0x05.
                ruleOffset, = struct.unpack_from('>H', data, offsetTable + 2 * ruleIndexes[i])
                ruleLength = ord(data[ruleOffset])
                inflection = self.applyInflectionRule(mainEntry, data, ruleOffset + 1, ruleOffset + 1 + ruleLength)
                if inflection is not None:
                    parts.append('  <idx:iform name="%s" value="%s"/>' % (inflectionName, inflection))

            parts.append("</idx:infl>")
        return "".join(parts)
コード例 #2
0
 def readCTOC(self, txtdata):
     # read all blocks from CTOC
     ctoc_data = {}
     offset = 0
     while offset<len(txtdata):
         if txtdata[offset] == '\0':
             break
         idx_offs = offset
         #first n bytes: name len as vwi
         pos, ilen = getVariableWidthValue(txtdata, offset)
         offset += pos
         #<len> next bytes: name
         name = txtdata[offset:offset+ilen]
         offset += ilen
         if DEBUG:
             print "name length is ", ilen
             print idx_offs, name
         ctoc_data[idx_offs] = name
     return ctoc_data
コード例 #3
0
    def getTagMap(self, controlByteCount, tagTable, entryData, startPos,
                  endPos):
        '''
        Create a map of tags and values from the given byte section.

        The section starts with controlByteCount control bytes followed by
        the value data. In a first pass every tagTable row is masked against
        its control byte to find out how its values are stored; in a second
        pass the values are read as variable width values, in the order the
        tags were found.

        @param controlByteCount: The number of control bytes.
        @param tagTable: The tag table; rows of (tag, valuesPerEntry, mask, endFlag).
        @param entryData: The data to process.
        @param startPos: The starting position in entryData.
        @param endPos: The end position in entryData or None if it is unknown.
        @return: Hashmap of tag and list of values.
        '''
        # Pending reads as (tag, valueCount, valueBytes, valuesPerEntry);
        # exactly one of valueCount / valueBytes is not None.
        tags = []
        tagHashMap = {}
        controlByteIndex = 0
        dataStart = startPos + controlByteCount

        for tag, valuesPerEntry, mask, endFlag in tagTable:
            if endFlag == 0x01:
                # The following rows refer to the next control byte.
                controlByteIndex += 1
                continue

            masked = ord(entryData[startPos + controlByteIndex]) & mask
            if masked == 0:
                # Tag is not present in this entry.
                continue

            if masked != mask:
                # Shift bits to get the masked value. A copy of the mask is
                # shifted so the loop variable itself stays untouched.
                shiftedMask = mask
                while shiftedMask & 0x01 == 0:
                    shiftedMask >>= 1
                    masked >>= 1
                tags.append((tag, masked, None, valuesPerEntry))
            elif self.countSetBits(mask) > 1:
                # If all bits of masked value are set and the mask has more
                # than one bit, a variable width value will follow after the
                # control bytes which defines the length of bytes (NOT the
                # value count!) which will contain the corresponding
                # variable width values.
                consumed, byteLength = getVariableWidthValue(
                    entryData, dataStart)
                dataStart += consumed
                tags.append((tag, None, byteLength, valuesPerEntry))
            else:
                # Single bit mask that is set: exactly one entry.
                tags.append((tag, 1, None, valuesPerEntry))

        for tag, valueCount, valueBytes, valuesPerEntry in tags:
            values = []
            if valueCount is not None:
                # Read valueCount * valuesPerEntry variable width values.
                for _ in range(valueCount * valuesPerEntry):
                    consumed, data = getVariableWidthValue(
                        entryData, dataStart)
                    dataStart += consumed
                    values.append(data)
            else:
                # Convert valueBytes to variable width values.
                totalConsumed = 0
                while totalConsumed < valueBytes:
                    # Does this work for valuesPerEntry != 1?
                    consumed, data = getVariableWidthValue(
                        entryData, dataStart)
                    dataStart += consumed
                    totalConsumed += consumed
                    values.append(data)
                if totalConsumed != valueBytes:
                    print("Error: Should consume %s bytes, but consumed %s" % (
                        valueBytes, totalConsumed))
            tagHashMap[tag] = values

        # Test that all bytes have been processed if endPos is given.
        if endPos is not None and dataStart != endPos:
            # The last entry might have some zero padding bytes, so complain
            # only if non zero bytes are left.
            leftover = entryData[dataStart:endPos]
            if any(char != chr(0x00) for char in leftover):
                print("Warning: There are unprocessed index bytes left: %s" %
                      toHex(leftover))
                if DEBUG_DICT:
                    print("controlByteCount: %s" % controlByteCount)
                    # 1-tuple so a tuple-typed tagTable is printed instead
                    # of being unpacked as format arguments.
                    print("tagTable: %s" % (tagTable,))
                    print("data: %s" % toHex(entryData[startPos:endPos]))
                    print("tagHashMap: %s" % (tagHashMap,))

        return tagHashMap
コード例 #4
0
    def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, data,
                            inflectionNames, groupList):
        '''
        Create string which contains the inflection groups with inflection rules as mobipocket tags.

        Each group index in groupList is resolved through the table of
        2 byte big endian offsets that begins 4 bytes after idxtPos (idxtPos
        is read from data at 0x14). The group's entry is decoded with
        getTagMap; the values of tag 0x05 locate the rule names and the
        values of tag 0x1a locate the rules, which are applied to mainEntry
        and emitted as <idx:iform> elements wrapped in <idx:infl>.

        @param mainEntry: The word to inflect.
        @param controlByteCount: The number of control bytes.
        @param tagTable: The tag table.
        @param data: The inflection index data.
        @param inflectionNames: The inflection rule name data.
        @param groupList: The list of inflection groups to process.
        @return: String with inflection groups and rules or empty string if required tags are not available.
        '''
        pieces = []
        idxtPos, = struct.unpack_from('>L', data, 0x14)
        entryCount, = struct.unpack_from('>L', data, 0x18)
        # Entry N's offset is stored at offsetTable + 2 * N.
        offsetTable = idxtPos + 4

        for groupIndex in groupList:
            groupOffset, = struct.unpack_from('>H', data,
                                              offsetTable + 2 * groupIndex)
            if groupIndex + 1 < entryCount:
                # The following entry's offset serves as the end position
                # handed to getTagMap.
                nextOffset, = struct.unpack_from(
                    '>H', data, offsetTable + 2 * (groupIndex + 1))
            else:
                # Last entry: the end position is unknown.
                nextOffset = None

            # First byte seems to be always 0x00 and must be skipped.
            # NOTE(review): assert is stripped under "python -O"; kept so
            # callers still see AssertionError.
            assert ord(data[groupOffset]) == 0x00
            tagMap = self.getTagMap(controlByteCount, tagTable, data,
                                    groupOffset + 1, nextOffset)

            # Make sure that the required tags are available. Output built
            # so far is dropped: the documented result is the empty string.
            if 0x05 not in tagMap:
                print("Error: Required tag 0x05 not found in tagMap")
                return ""
            if 0x1a not in tagMap:
                print("Error: Required tag 0x1a not found in tagMap")
                return ""

            pieces.append("<idx:infl>")

            ruleIndexes = tagMap[0x1a]
            for i, nameOffset in enumerate(tagMap[0x05]):
                # Get name of inflection rule: a variable width length
                # followed by the name text.
                consumed, textLength = getVariableWidthValue(
                    inflectionNames, nameOffset)
                nameStart = nameOffset + consumed
                inflectionName = inflectionNames[nameStart:nameStart +
                                                 textLength]

                # Get and apply inflection rule. The rule entry starts with
                # a one byte length. Indexing (rather than zip) keeps the
                # IndexError if tag 0x1a has fewer values than tag 0x05.
                ruleOffset, = struct.unpack_from(
                    '>H', data, offsetTable + 2 * ruleIndexes[i])
                ruleLength = ord(data[ruleOffset])
                inflection = self.applyInflectionRule(
                    mainEntry, data, ruleOffset + 1,
                    ruleOffset + 1 + ruleLength)
                if inflection is not None:
                    pieces.append('  <idx:iform name="%s" value="%s"/>' % (
                        inflectionName, inflection))

            pieces.append("</idx:infl>")
        return "".join(pieces)
コード例 #5
0
ファイル: mobi_dict.py プロジェクト: mlitwin/hesperian-tools
    def getTagMap(self, controlByteCount, tagTable, entryData, startPos, endPos):
        '''
        Create a map of tags and values from the given byte section.

        The section starts with controlByteCount control bytes followed by the value data.
        A first pass masks every tagTable row against its control byte to determine how the
        tag's values are stored; a second pass reads the values as variable width values,
        in the order the tags were found.

        @param controlByteCount: The number of control bytes.
        @param tagTable: The tag table; rows of (tag, valuesPerEntry, mask, endFlag).
        @param entryData: The data to process.
        @param startPos: The starting position in entryData.
        @param endPos: The end position in entryData or None if it is unknown.
        @return: Hashmap of tag and list of values.
        '''
        # Pending reads as (tag, valueCount, valueBytes, valuesPerEntry); exactly one of
        # valueCount / valueBytes is not None.
        tags = []
        tagHashMap = {}
        controlByteIndex = 0
        dataStart = startPos + controlByteCount

        for tag, valuesPerEntry, mask, endFlag in tagTable:
            if endFlag == 0x01:
                # The following rows refer to the next control byte.
                controlByteIndex += 1
                continue

            masked = ord(entryData[startPos + controlByteIndex]) & mask
            if masked == 0:
                # Tag is not present in this entry.
                continue

            if masked != mask:
                # Shift bits to get the masked value. A copy of the mask is shifted so the
                # loop variable itself stays untouched.
                shiftedMask = mask
                while shiftedMask & 0x01 == 0:
                    shiftedMask >>= 1
                    masked >>= 1
                tags.append((tag, masked, None, valuesPerEntry))
            elif self.countSetBits(mask) > 1:
                # If all bits of masked value are set and the mask has more than one bit, a variable width value
                # will follow after the control bytes which defines the length of bytes (NOT the value count!)
                # which will contain the corresponding variable width values.
                consumed, byteLength = getVariableWidthValue(entryData, dataStart)
                dataStart += consumed
                tags.append((tag, None, byteLength, valuesPerEntry))
            else:
                # Single bit mask that is set: exactly one entry.
                tags.append((tag, 1, None, valuesPerEntry))

        for tag, valueCount, valueBytes, valuesPerEntry in tags:
            values = []
            if valueCount is not None:
                # Read valueCount * valuesPerEntry variable width values.
                for _ in range(valueCount * valuesPerEntry):
                    consumed, data = getVariableWidthValue(entryData, dataStart)
                    dataStart += consumed
                    values.append(data)
            else:
                # Convert valueBytes to variable width values.
                totalConsumed = 0
                while totalConsumed < valueBytes:
                    # Does this work for valuesPerEntry != 1?
                    consumed, data = getVariableWidthValue(entryData, dataStart)
                    dataStart += consumed
                    totalConsumed += consumed
                    values.append(data)
                if totalConsumed != valueBytes:
                    print("Error: Should consume %s bytes, but consumed %s" % (valueBytes, totalConsumed))
            tagHashMap[tag] = values

        # Test that all bytes have been processed if endPos is given.
        if endPos is not None and dataStart != endPos:
            # The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
            leftover = entryData[dataStart:endPos]
            if any(char != chr(0x00) for char in leftover):
                print("Warning: There are unprocessed index bytes left: %s" % toHex(leftover))
                if DEBUG_DICT:
                    print("controlByteCount: %s" % controlByteCount)
                    # 1-tuple so a tuple-typed tagTable is printed instead of being unpacked as format arguments.
                    print("tagTable: %s" % (tagTable,))
                    print("data: %s" % toHex(entryData[startPos:endPos]))
                    print("tagHashMap: %s" % (tagHashMap,))

        return tagHashMap