Ejemplo n.º 1
0
 def getIndexData(self, idx):
     sect = self.sect
     outtbl = []
     ctoc_text = {}
     if idx != 0xffffffff:
         data = sect.loadSection(idx)
         idxhdr = self.parseINDXHeader(data)
         IndexCount = idxhdr['count']
         # handle the case of multiple sections used for CTOC
         rec_off = 0
         off = idx + IndexCount + 1
         for j in range(idxhdr['nctoc']):
             cdata = sect.loadSection(off + j)
             ctocdict = self.readCTOC(cdata)
             for k in ctocdict.keys():
                 ctoc_text[k + rec_off] = ctocdict[k]
             rec_off += 0x10000
         tagSectionStart = idxhdr['len']
         controlByteCount, tagTable = readTagSection(tagSectionStart, data)
         if DEBUG:
             print "IndexCount is", IndexCount
             print "TagTable: %s" % tagTable
         for i in range(idx + 1, idx + 1 + IndexCount):
             data = sect.loadSection(i)
             hdrinfo = self.parseINDXHeader(data)
             idxtPos = hdrinfo['start']
             entryCount = hdrinfo['count']
             if DEBUG:
                 print idxtPos, entryCount
             # loop through to build up the IDXT position starts
             idxPositions = []
             for j in range(entryCount):
                 pos, = struct.unpack_from('>H', data,
                                           idxtPos + 4 + (2 * j))
                 idxPositions.append(pos)
             # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
             idxPositions.append(idxtPos)
             # for each entry in the IDXT build up the tagMap and any associated text
             for j in range(entryCount):
                 startPos = idxPositions[j]
                 endPos = idxPositions[j + 1]
                 textLength = ord(data[startPos])
                 text = data[startPos + 1:startPos + 1 + textLength]
                 tagMap = self.getTagMap(controlByteCount, tagTable, data,
                                         startPos + 1 + textLength, endPos)
                 outtbl.append([text, tagMap])
                 if DEBUG:
                     print tagMap
                     print text
     return outtbl, ctoc_text
Ejemplo n.º 2
0
 def getIndexData(self, idx):
     sect = self.sect
     outtbl = []
     ctoc_text = {}
     if idx != 0xffffffff:
         data = sect.loadSection(idx)
         idxhdr = self.parseINDXHeader(data)
         IndexCount = idxhdr['count']
         # handle the case of multiple sections used for CTOC
         rec_off = 0
         off = idx + IndexCount + 1
         for j in range(idxhdr['nctoc']):
             cdata = sect.loadSection(off + j)
             ctocdict = self.readCTOC(cdata)
             for k in ctocdict.keys():
                 ctoc_text[k + rec_off] = ctocdict[k]
             rec_off += 0x10000
         tagSectionStart = idxhdr['len']
         controlByteCount, tagTable = readTagSection(tagSectionStart, data)
         if DEBUG:
             print "IndexCount is", IndexCount
             print "TagTable: %s" % tagTable
         for i in range(idx + 1, idx + 1 + IndexCount):
             data = sect.loadSection(i)
             hdrinfo = self.parseINDXHeader(data)
             idxtPos = hdrinfo['start']
             entryCount = hdrinfo['count']
             if DEBUG:
                 print idxtPos, entryCount
             # loop through to build up the IDXT position starts
             idxPositions = []
             for j in range(entryCount):
                 pos, = struct.unpack_from('>H', data, idxtPos + 4 + (2 * j))
                 idxPositions.append(pos)
             # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
             idxPositions.append(idxtPos)
             # for each entry in the IDXT build up the tagMap and any associated text
             for j in range(entryCount):
                 startPos = idxPositions[j]
                 endPos = idxPositions[j+1]
                 textLength = ord(data[startPos])
                 text = data[startPos+1:startPos+1+textLength]
                 tagMap = self.getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
                 outtbl.append([text, tagMap])
                 if DEBUG:
                     print tagMap
                     print text
     return outtbl, ctoc_text
Ejemplo n.º 3
0
    def getPositionMap(self):
        header = self.header
        sect = self.sect

        positionMap = {}

        metaOrthIndex = self.metaOrthIndex
        metaInflIndex = self.metaInflIndex

        decodeInflection = True
        if metaOrthIndex != 0xFFFFFFFF:
            print "Info: Document contains orthographic index, handle as dictionary"
            if metaInflIndex == 0xFFFFFFFF:
                decodeInflection = False
            else:
                metaInflIndexData = sect.loadSection(metaInflIndex)
                metaIndexCount, = struct.unpack_from('>L', metaInflIndexData,
                                                     0x18)
                if metaIndexCount != 1:
                    print "Error: Dictionary contains multiple inflection index sections, which is not yet supported"
                    decodeInflection = False
                inflIndexData = sect.loadSection(metaInflIndex + 1)
                inflNameData = sect.loadSection(metaInflIndex + 1 +
                                                metaIndexCount)
                tagSectionStart, = struct.unpack_from('>L', metaInflIndexData,
                                                      0x04)
                inflectionControlByteCount, inflectionTagTable = readTagSection(
                    tagSectionStart, metaInflIndexData)
                if DEBUG_DICT:
                    print "inflectionTagTable: %s" % inflectionTagTable
                if self.hasTag(inflectionTagTable, 0x07):
                    print "Error: Dictionary uses obsolete inflection rule scheme which is not yet supported"
                    decodeInflection = False

            data = sect.loadSection(metaOrthIndex)
            tagSectionStart, = struct.unpack_from('>L', data, 0x04)
            controlByteCount, tagTable = readTagSection(tagSectionStart, data)
            orthIndexCount, = struct.unpack_from('>L', data, 0x18)
            print "orthIndexCount is", orthIndexCount
            if DEBUG_DICT:
                print "orthTagTable: %s" % tagTable
            hasEntryLength = self.hasTag(tagTable, 0x02)
            if not hasEntryLength:
                print "Info: Index doesn't contain entry length tags"

            print "Read dictionary index data"
            for i in range(metaOrthIndex + 1,
                           metaOrthIndex + 1 + orthIndexCount):
                data = sect.loadSection(i)
                idxtPos, = struct.unpack_from('>L', data, 0x14)
                entryCount, = struct.unpack_from('>L', data, 0x18)
                idxPositions = []
                for j in range(entryCount):
                    pos, = struct.unpack_from('>H', data,
                                              idxtPos + 4 + (2 * j))
                    idxPositions.append(pos)
                # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
                idxPositions.append(idxtPos)

                for j in range(entryCount):
                    startPos = idxPositions[j]
                    endPos = idxPositions[j + 1]
                    textLength = ord(data[startPos])
                    text = data[startPos + 1:startPos + 1 + textLength]
                    tagMap = self.getTagMap(controlByteCount, tagTable, data,
                                            startPos + 1 + textLength, endPos)
                    if 0x01 in tagMap:
                        if decodeInflection and 0x2a in tagMap:
                            inflectionGroups = self.getInflectionGroups(
                                text, inflectionControlByteCount,
                                inflectionTagTable, inflIndexData,
                                inflNameData, tagMap[0x2a])
                        else:
                            inflectionGroups = ""
                        assert len(tagMap[0x01]) == 1
                        entryStartPosition = tagMap[0x01][0]
                        if hasEntryLength:
                            # The idx:entry attribute "scriptable" must be present to create entry length tags.
                            ml = '<idx:entry scriptable="yes"><idx:orth value="%s">%s</idx:orth>' % (
                                text, inflectionGroups)
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = positionMap[
                                    entryStartPosition] + ml
                            else:
                                positionMap[entryStartPosition] = ml
                            assert len(tagMap[0x02]) == 1
                            entryEndPosition = entryStartPosition + tagMap[
                                0x02][0]
                            if entryEndPosition in positionMap:
                                positionMap[
                                    entryEndPosition] = "</idx:entry>" + positionMap[
                                        entryEndPosition]
                            else:
                                positionMap[entryEndPosition] = "</idx:entry>"

                        else:
                            indexTags = '<idx:entry>\n<idx:orth value="%s">\n%s</idx:entry>\n' % (
                                text, inflectionGroups)
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = positionMap[
                                    entryStartPosition] + indexTags
                            else:
                                positionMap[entryStartPosition] = indexTags
        return positionMap
Ejemplo n.º 4
0
    def getPositionMap (self):
        header = self.header
        sect = self.sect

        positionMap = {}

        metaOrthIndex = self.metaOrthIndex
        metaInflIndex = self.metaInflIndex

        decodeInflection = True
        if metaOrthIndex != 0xFFFFFFFF:
            print "Info: Document contains orthographic index, handle as dictionary"
            if metaInflIndex == 0xFFFFFFFF:
                decodeInflection = False
            else:
                metaInflIndexData = sect.loadSection(metaInflIndex)
                metaIndexCount, = struct.unpack_from('>L', metaInflIndexData, 0x18)
                if metaIndexCount != 1:
                    print "Error: Dictionary contains multiple inflection index sections, which is not yet supported"
                    decodeInflection = False
                inflIndexData = sect.loadSection(metaInflIndex + 1)
                inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
                tagSectionStart, = struct.unpack_from('>L', metaInflIndexData, 0x04)
                inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData)
                if DEBUG_DICT:
                    print "inflectionTagTable: %s" % inflectionTagTable
                if self.hasTag(inflectionTagTable, 0x07):
                    print "Error: Dictionary uses obsolete inflection rule scheme which is not yet supported"
                    decodeInflection = False

            data = sect.loadSection(metaOrthIndex)
            tagSectionStart, = struct.unpack_from('>L', data, 0x04)
            controlByteCount, tagTable = readTagSection(tagSectionStart, data)
            orthIndexCount, = struct.unpack_from('>L', data, 0x18)
            print "orthIndexCount is", orthIndexCount
            if DEBUG_DICT:
                print "orthTagTable: %s" % tagTable
            hasEntryLength = self.hasTag(tagTable, 0x02)
            if not hasEntryLength:
                print "Info: Index doesn't contain entry length tags"

            print "Read dictionary index data"
            for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
                data = sect.loadSection(i)
                idxtPos, = struct.unpack_from('>L', data, 0x14)
                entryCount, = struct.unpack_from('>L', data, 0x18)
                idxPositions = []
                for j in range(entryCount):
                    pos, = struct.unpack_from('>H', data, idxtPos + 4 + (2 * j))
                    idxPositions.append(pos)
                # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
                idxPositions.append(idxtPos)

                for j in range(entryCount):
                    startPos = idxPositions[j]
                    endPos = idxPositions[j+1]
                    textLength = ord(data[startPos])
                    text = data[startPos+1:startPos+1+textLength]
                    tagMap = self.getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
                    if 0x01 in tagMap:
                        if decodeInflection and 0x2a in tagMap:
                            inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable, inflIndexData, inflNameData, tagMap[0x2a])
                        else:
                            inflectionGroups = ""
                        assert len(tagMap[0x01]) == 1
                        entryStartPosition = tagMap[0x01][0]
                        if hasEntryLength:
                            # The idx:entry attribute "scriptable" must be present to create entry length tags.
                            ml = '<idx:entry scriptable="yes"><idx:orth value="%s">%s</idx:orth>' % (text, inflectionGroups)
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml
                            else:
                                positionMap[entryStartPosition] = ml
                            assert len(tagMap[0x02]) == 1
                            entryEndPosition = entryStartPosition + tagMap[0x02][0]
                            if entryEndPosition in positionMap:
                                positionMap[entryEndPosition] = "</idx:entry>" + positionMap[entryEndPosition]
                            else:
                                positionMap[entryEndPosition] = "</idx:entry>"

                        else:
                            indexTags = '<idx:entry>\n<idx:orth value="%s">\n%s</idx:entry>\n' % (text, inflectionGroups)
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags
                            else:
                                positionMap[entryStartPosition] = indexTags
        return positionMap