def getPositionMap(self): header = self.header sect = self.sect positionMap = {} metaOrthIndex = self.metaOrthIndex metaInflIndex = self.metaInflIndex decodeInflection = True if metaOrthIndex != 0xFFFFFFFF: print "Info: Document contains orthographic index, handle as dictionary" if metaInflIndex == 0xFFFFFFFF: decodeInflection = False else: metaInflIndexData = sect.loadSection(metaInflIndex) print "\nParsing metaInflIndexData" midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData) metaIndexCount = midxhdr['count'] idatas = [] for j in range(metaIndexCount): idatas.append(sect.loadSection(metaInflIndex + 1 + j)) dinfl = InflectionData(idatas) inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount) tagSectionStart = midxhdr['len'] inflectionControlByteCount, inflectionTagTable = readTagSection( tagSectionStart, metaInflIndexData) if DEBUG_DICT: print "inflectionTagTable: %s" % inflectionTagTable if self.hasTag(inflectionTagTable, 0x07): print "Error: Dictionary uses obsolete inflection rule scheme which is not yet supported" decodeInflection = False data = sect.loadSection(metaOrthIndex) print "\nParsing metaOrthIndex" idxhdr, hordt1, hordt2 = self.parseHeader(data) tagSectionStart = idxhdr['len'] controlByteCount, tagTable = readTagSection(tagSectionStart, data) orthIndexCount = idxhdr['count'] print "orthIndexCount is", orthIndexCount if DEBUG_DICT: print "orthTagTable: %s" % tagTable if hordt2 is not None: print "orth entry uses ordt2 lookup table of type ", idxhdr[ 'otype'] hasEntryLength = self.hasTag(tagTable, 0x02) if not hasEntryLength: print "Info: Index doesn't contain entry length tags" print "Read dictionary index data" for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount): data = sect.loadSection(i) hdrinfo, ordt1, ordt2 = self.parseHeader(data) idxtPos = hdrinfo['start'] entryCount = hdrinfo['count'] idxPositions = [] for j in range(entryCount): pos, = struct.unpack_from('>H', data, idxtPos + 4 + (2 * j)) idxPositions.append(pos) # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!) idxPositions.append(idxtPos) for j in range(entryCount): startPos = idxPositions[j] endPos = idxPositions[j + 1] textLength = ord(data[startPos]) text = data[startPos + 1:startPos + 1 + textLength] if hordt2 is not None: utext = u"" if idxhdr['otype'] == 0: pattern = '>H' inc = 2 else: pattern = '>B' inc = 1 pos = 0 while pos < textLength: off, = struct.unpack_from(pattern, text, pos) if off < len(hordt2): utext += unichr(hordt2[off]) else: utext += unichr(off) pos += inc text = utext.encode('utf-8') tagMap = getTagMap(controlByteCount, tagTable, data, startPos + 1 + textLength, endPos) if 0x01 in tagMap: if decodeInflection and 0x2a in tagMap: inflectionGroups = self.getInflectionGroups( text, inflectionControlByteCount, inflectionTagTable, dinfl, inflNameData, tagMap[0x2a]) else: inflectionGroups = "" assert len(tagMap[0x01]) == 1 entryStartPosition = tagMap[0x01][0] if hasEntryLength: # The idx:entry attribute "scriptable" must be present to create entry length tags. ml = '<idx:entry scriptable="yes"><idx:orth value="%s">%s</idx:orth>' % ( text, inflectionGroups) if entryStartPosition in positionMap: positionMap[entryStartPosition] = positionMap[ entryStartPosition] + ml else: positionMap[entryStartPosition] = ml assert len(tagMap[0x02]) == 1 entryEndPosition = entryStartPosition + tagMap[ 0x02][0] if entryEndPosition in positionMap: positionMap[ entryEndPosition] = "</idx:entry>" + positionMap[ entryEndPosition] else: positionMap[entryEndPosition] = "</idx:entry>" else: indexTags = '<idx:entry>\n<idx:orth value="%s">\n%s</idx:entry>\n' % ( text, inflectionGroups) if entryStartPosition in positionMap: positionMap[entryStartPosition] = positionMap[ entryStartPosition] + indexTags else: positionMap[entryStartPosition] = indexTags return positionMap
def getPositionMap(self): header = self.header sect = self.sect positionMap = {} metaOrthIndex = self.metaOrthIndex metaInflIndex = self.metaInflIndex decodeInflection = True if metaOrthIndex != 0xFFFFFFFF: print "Info: Document contains orthographic index, handle as dictionary" if metaInflIndex == 0xFFFFFFFF: decodeInflection = False else: metaInflIndexData = sect.loadSection(metaInflIndex) metaIndexCount, = struct.unpack_from('>L', metaInflIndexData, 0x18) if metaIndexCount != 1: print "Error: Dictionary contains multiple inflection index sections, which is not yet supported" decodeInflection = False inflIndexData = sect.loadSection(metaInflIndex + 1) inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount) tagSectionStart, = struct.unpack_from('>L', metaInflIndexData, 0x04) inflectionControlByteCount, inflectionTagTable = readTagSection( tagSectionStart, metaInflIndexData) if DEBUG_DICT: print "inflectionTagTable: %s" % inflectionTagTable if self.hasTag(inflectionTagTable, 0x07): print "Error: Dictionary uses obsolete inflection rule scheme which is not yet supported" decodeInflection = False data = sect.loadSection(metaOrthIndex) tagSectionStart, = struct.unpack_from('>L', data, 0x04) controlByteCount, tagTable = readTagSection(tagSectionStart, data) orthIndexCount, = struct.unpack_from('>L', data, 0x18) print "orthIndexCount is", orthIndexCount if DEBUG_DICT: print "orthTagTable: %s" % tagTable hasEntryLength = self.hasTag(tagTable, 0x02) if not hasEntryLength: print "Info: Index doesn't contain entry length tags" print "Read dictionary index data" for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount): data = sect.loadSection(i) idxtPos, = struct.unpack_from('>L', data, 0x14) entryCount, = struct.unpack_from('>L', data, 0x18) idxPositions = [] for j in range(entryCount): pos, = struct.unpack_from('>H', data, idxtPos + 4 + (2 * j)) idxPositions.append(pos) # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!) idxPositions.append(idxtPos) for j in range(entryCount): startPos = idxPositions[j] endPos = idxPositions[j + 1] textLength = ord(data[startPos]) text = data[startPos + 1:startPos + 1 + textLength] tagMap = getTagMap(controlByteCount, tagTable, data, startPos + 1 + textLength, endPos) if 0x01 in tagMap: if decodeInflection and 0x2a in tagMap: inflectionGroups = self.getInflectionGroups( text, inflectionControlByteCount, inflectionTagTable, inflIndexData, inflNameData, tagMap[0x2a]) else: inflectionGroups = "" assert len(tagMap[0x01]) == 1 entryStartPosition = tagMap[0x01][0] if hasEntryLength: # The idx:entry attribute "scriptable" must be present to create entry length tags. ml = '<idx:entry scriptable="yes"><idx:orth value="%s">%s</idx:orth>' % ( text, inflectionGroups) if entryStartPosition in positionMap: positionMap[entryStartPosition] = positionMap[ entryStartPosition] + ml else: positionMap[entryStartPosition] = ml assert len(tagMap[0x02]) == 1 entryEndPosition = entryStartPosition + tagMap[ 0x02][0] if entryEndPosition in positionMap: positionMap[ entryEndPosition] = "</idx:entry>" + positionMap[ entryEndPosition] else: positionMap[entryEndPosition] = "</idx:entry>" else: indexTags = '<idx:entry>\n<idx:orth value="%s">\n%s</idx:entry>\n' % ( text, inflectionGroups) if entryStartPosition in positionMap: positionMap[entryStartPosition] = positionMap[ entryStartPosition] + indexTags else: positionMap[entryStartPosition] = indexTags return positionMap
def getPositionMap (self): header = self.header sect = self.sect positionMap = {} metaOrthIndex = self.metaOrthIndex metaInflIndex = self.metaInflIndex decodeInflection = True if metaOrthIndex != 0xFFFFFFFF: print "Info: Document contains orthographic index, handle as dictionary" if metaInflIndex == 0xFFFFFFFF: decodeInflection = False else: metaInflIndexData = sect.loadSection(metaInflIndex) print "\nParsing metaInflIndexData" midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData) metaIndexCount = midxhdr['count'] idatas = [] for j in range(metaIndexCount): idatas.append(sect.loadSection(metaInflIndex + 1 + j)) dinfl = InflectionData(idatas) inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount) tagSectionStart = midxhdr['len'] inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData) if DEBUG_DICT: print "inflectionTagTable: %s" % inflectionTagTable if self.hasTag(inflectionTagTable, 0x07): print "Error: Dictionary uses obsolete inflection rule scheme which is not yet supported" decodeInflection = False data = sect.loadSection(metaOrthIndex) print "\nParsing metaOrthIndex" idxhdr, hordt1, hordt2 = self.parseHeader(data) tagSectionStart = idxhdr['len'] controlByteCount, tagTable = readTagSection(tagSectionStart, data) orthIndexCount = idxhdr['count'] print "orthIndexCount is", orthIndexCount if DEBUG_DICT: print "orthTagTable: %s" % tagTable if hordt2 is not None: print "orth entry uses ordt2 lookup table of type ", idxhdr['otype'] hasEntryLength = self.hasTag(tagTable, 0x02) if not hasEntryLength: print "Info: Index doesn't contain entry length tags" print "Read dictionary index data" for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount): data = sect.loadSection(i) hdrinfo, ordt1, ordt2 = self.parseHeader(data) idxtPos = hdrinfo['start'] entryCount = hdrinfo['count'] idxPositions = [] for j in range(entryCount): pos, = struct.unpack_from('>H', data, idxtPos + 4 + (2 * j)) idxPositions.append(pos) # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!) idxPositions.append(idxtPos) for j in range(entryCount): startPos = idxPositions[j] endPos = idxPositions[j+1] textLength = ord(data[startPos]) text = data[startPos+1:startPos+1+textLength] if hordt2 is not None: utext = u"" if idxhdr['otype'] == 0: pattern = '>H' inc = 2 else: pattern = '>B' inc = 1 pos = 0 while pos < textLength: off, = struct.unpack_from(pattern, text, pos) if off < len(hordt2): utext += unichr(hordt2[off]) else: utext += unichr(off) pos += inc text = utext.encode('utf-8') tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos) if 0x01 in tagMap: if decodeInflection and 0x2a in tagMap: inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable, dinfl, inflNameData, tagMap[0x2a]) else: inflectionGroups = "" assert len(tagMap[0x01]) == 1 entryStartPosition = tagMap[0x01][0] if hasEntryLength: # The idx:entry attribute "scriptable" must be present to create entry length tags. ml = '<idx:entry scriptable="yes"><idx:orth value="%s">%s</idx:orth>' % (text, inflectionGroups) if entryStartPosition in positionMap: positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml else: positionMap[entryStartPosition] = ml assert len(tagMap[0x02]) == 1 entryEndPosition = entryStartPosition + tagMap[0x02][0] if entryEndPosition in positionMap: positionMap[entryEndPosition] = "</idx:entry>" + positionMap[entryEndPosition] else: positionMap[entryEndPosition] = "</idx:entry>" else: indexTags = '<idx:entry>\n<idx:orth value="%s">\n%s</idx:entry>\n' % (text, inflectionGroups) if entryStartPosition in positionMap: positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags else: positionMap[entryStartPosition] = indexTags return positionMap
def getPositionMap (self): header = self.header sect = self.sect positionMap = {} metaOrthIndex = self.metaOrthIndex metaInflIndex = self.metaInflIndex decodeInflection = True if metaOrthIndex != 0xFFFFFFFF: print "Info: Document contains orthographic index, handle as dictionary" if metaInflIndex == 0xFFFFFFFF: decodeInflection = False else: metaInflIndexData = sect.loadSection(metaInflIndex) metaIndexCount, = struct.unpack_from('>L', metaInflIndexData, 0x18) if metaIndexCount != 1: print "Error: Dictionary contains multiple inflection index sections, which is not yet supported" decodeInflection = False inflIndexData = sect.loadSection(metaInflIndex + 1) inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount) tagSectionStart, = struct.unpack_from('>L', metaInflIndexData, 0x04) inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData) if DEBUG_DICT: print "inflectionTagTable: %s" % inflectionTagTable if self.hasTag(inflectionTagTable, 0x07): print "Error: Dictionary uses obsolete inflection rule scheme which is not yet supported" decodeInflection = False data = sect.loadSection(metaOrthIndex) tagSectionStart, = struct.unpack_from('>L', data, 0x04) controlByteCount, tagTable = readTagSection(tagSectionStart, data) orthIndexCount, = struct.unpack_from('>L', data, 0x18) print "orthIndexCount is", orthIndexCount if DEBUG_DICT: print "orthTagTable: %s" % tagTable hasEntryLength = self.hasTag(tagTable, 0x02) if not hasEntryLength: print "Info: Index doesn't contain entry length tags" print "Read dictionary index data" for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount): data = sect.loadSection(i) idxtPos, = struct.unpack_from('>L', data, 0x14) entryCount, = struct.unpack_from('>L', data, 0x18) idxPositions = [] for j in range(entryCount): pos, = struct.unpack_from('>H', data, idxtPos + 4 + (2 * j)) idxPositions.append(pos) # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!) idxPositions.append(idxtPos) for j in range(entryCount): startPos = idxPositions[j] endPos = idxPositions[j+1] textLength = ord(data[startPos]) text = data[startPos+1:startPos+1+textLength] tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos) if 0x01 in tagMap: if decodeInflection and 0x2a in tagMap: inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable, inflIndexData, inflNameData, tagMap[0x2a]) else: inflectionGroups = "" assert len(tagMap[0x01]) == 1 entryStartPosition = tagMap[0x01][0] if hasEntryLength: # The idx:entry attribute "scriptable" must be present to create entry length tags. ml = '<idx:entry scriptable="yes"><idx:orth value="%s">%s</idx:orth>' % (text, inflectionGroups) if entryStartPosition in positionMap: positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml else: positionMap[entryStartPosition] = ml assert len(tagMap[0x02]) == 1 entryEndPosition = entryStartPosition + tagMap[0x02][0] if entryEndPosition in positionMap: positionMap[entryEndPosition] = "</idx:entry>" + positionMap[entryEndPosition] else: positionMap[entryEndPosition] = "</idx:entry>" else: indexTags = '<idx:entry>\n<idx:orth value="%s">\n%s</idx:entry>\n' % (text, inflectionGroups) if entryStartPosition in positionMap: positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags else: positionMap[entryStartPosition] = indexTags return positionMap