Beispiel #1
0
class DartsDictionary:
    """Japanese word dictionary backed by a Darts double-array trie
    (JDIC binary format): an index, an offset table and a NUL-terminated
    entry blob of kanji/kana/definition records.
    """

    def __init__(self, dataReader):
        """Load the whole dictionary from an open binary reader."""
        self.readData(dataReader)

    def readData(self, dictFile):
        """Parse the JDIC binary layout from *dictFile*.

        Layout: 4-byte magic, 32-byte charset name, three uint32 section
        sizes, then the darts index, the offset table and the entry blob.

        Raises:
            RuntimeError: if the magic id is not ``b'JDC0'``.
        """
        magicId = dictFile.read(4)
        if magicId != b'JDC0':
            raise RuntimeError('File is not JDIC file')
        charSetBuffer, = struct.unpack('32s', dictFile.read(32))
        self.charset = extractString(charSetBuffer).lower()
        fmt = 'III'
        # Fix: reuse fmt for the unpack as well (was a duplicated 'III' literal).
        (dartsSize, entryOffsetBlobSize, entryBlobSize) = \
            struct.unpack(fmt, dictFile.read(struct.calcsize(fmt)))
        self.lookupDict = DoubleArray(dictFile.read(dartsSize))
        self.entryOffsetBlob = dictFile.read(entryOffsetBlobSize)
        self.entryBlob = dictFile.read(entryBlobSize)

    def getFirstReadingAndDefinition(self, word):
        """Return the first ``(kana, definition)`` pair for *word*,
        or ``(None, None)`` when there is no match."""
        entries = self.getAllReadingAndDefinition(word)
        return entries[0] if entries else (None, None)

    def getAllReadingAndDefinition(self, word):
        """Return every ``(kana, definition)`` pair exactly matching *word*.

        Raises:
            ValueError: if *word* is empty.
        """
        # Fix: explicit raise instead of assert (asserts vanish under -O).
        if not word:
            raise ValueError('word must be non-empty')
        entries = []
        offsets = self.lookupDict.exactMatchSearch(bytearray(word, 'utf-8'))
        for tokenHandler, tokenLength in offsets:
            # Low byte is the entry count; remaining bits index the offset table.
            entryNum = tokenHandler & 0xff
            entryOffsetStartPos = tokenHandler >> 8
            for i in range(entryNum):
                offset = self.getEntryOffset(entryOffsetStartPos + i)
                entry = self.getEntry(offset)
                if entry is None:
                    # Fix: getEntry returns None on an unterminated blob;
                    # the original crashed with AttributeError on .split.
                    continue
                (kanji, kana, text) = entry.split(b'\x01')
                kana = text_type(kana, 'utf-8')
                text = text_type(text, 'utf-8')
                entries.append((kana, text))
        return entries

    def getBestReadingAndDefinition(self, word, pos):
        """Return a single ``(kana, definition)`` pair for *word*.

        Fix: the original computed the matches and fell through, always
        returning None.  *pos* (part of speech) is accepted but currently
        unused — TODO: rank candidates by pos.
        """
        return self.getFirstReadingAndDefinition(word)

    def getEntryOffset(self, entryOffsetIdx):
        """Return the blob offset stored at *entryOffsetIdx* in the
        uint32 offset table."""
        fmt = 'I'
        offsetSize = struct.calcsize(fmt)
        offset, = struct.unpack(fmt, self.entryOffsetBlob[entryOffsetIdx * offsetSize : (entryOffsetIdx + 1) * offsetSize])
        return offset

    def getEntry(self, entryOffset):
        """Return the NUL-terminated entry bytes starting at *entryOffset*,
        or None when no terminator is found."""
        endOfEntry = self.entryBlob.find(b'\x00', entryOffset)
        if endOfEntry >= 0:
            return self.entryBlob[entryOffset:endOfEntry]
        return None

    def splitEntry(self, entry):
        """Split a raw entry into its ``\\x01``-separated fields.

        Fix: the original performed the split but had no return statement.
        """
        return entry.split(b'\x01')
Beispiel #2
0
 def readData(self, dictFile):
     """Parse the JDIC binary layout from *dictFile*.

     Layout: 4-byte magic, 32-byte charset name, three uint32 section
     sizes, then the darts index, the offset table and the entry blob.

     Raises:
         RuntimeError: if the magic id is not ``b'JDC0'``.
     """
     magicId = dictFile.read(4)
     if magicId != b'JDC0':
         raise RuntimeError('File is not JDIC file')
     charSetBuffer, = struct.unpack('32s', dictFile.read(32))
     self.charset = extractString(charSetBuffer).lower()
     fmt = 'III'
     # Fix: reuse fmt for the unpack as well (was a duplicated 'III' literal).
     (dartsSize, entryOffsetBlobSize, entryBlobSize) = \
          struct.unpack(fmt, dictFile.read(struct.calcsize(fmt)))
     self.lookupDict = DoubleArray(dictFile.read(dartsSize))
     self.entryOffsetBlob = dictFile.read(entryOffsetBlobSize)
     self.entryBlob = dictFile.read(entryBlobSize)
Beispiel #3
0
 def loadFromBinary(self, dictFile):
     """
         Reads the binary dictionary sections from an open stream:
         charset, double array, token blob and feature blob.
         Raises RuntimeError when the on-disk version is unsupported.
     """
     headerFmt = str('<IIIIIIIIII')
     rawHeader = dictFile.read(calcsize(headerFmt))
     (magic, version, dictType, lexSize, leftSize, rightSize,
      dataSize, tokenPartSize, featurePartSize, dummy) = unpack(headerFmt, rawHeader)
     if version != 102:
         raise RuntimeError('Incompatible dictionary version: {0}'.format(version))
     charSetBuffer, = unpack(str('32s'), dictFile.read(32))
     self.charset = extractString(charSetBuffer).lower()
     self.doubleArray = DoubleArray(dictFile.read(dataSize))
     self.tokenBlob = dictFile.read(tokenPartSize)
     self.featureBlob = dictFile.read(featurePartSize)
Beispiel #4
0
class Dictionary:
    """MeCab-style binary dictionary: a Darts double-array index over
    packed tokens plus a NUL-separated feature (definition) blob.
    """

    def __init__(self, loader, dicName):
        self.charset = None
        self.tokenBlob = []
        self.featureBlob = None
        self.doubleArray = None
        # Fix: these caches were appended to by loadFeatures()/loadTokens()
        # but never initialized anywhere (AttributeError on first use).
        self.__features = []
        self.__tokens = []
        with loader.load(dicName) as dataReader:
            self.loadFromBinary(dataReader)

    def getToken(self, tokenId):
        """Unpack and return the Token stored at index *tokenId* in the
        token blob (fixed-size records)."""
        fmt = str('HHHhII')
        tokenSize = calcsize(fmt)
        #NOTE: dictionary tokens don't store their texts,
        #      which are available either looking up the token features
        #      or during the parsing
        fields = unpack(fmt, self.tokenBlob[tokenId * tokenSize : (tokenId + 1) * tokenSize])
        return Token('', fields[0], fields[1], fields[2],
                         fields[3], fields[4], fields[5] )

    def loadFeatures(self, data):
        """
            Loads all features from the dictionary.
            Note: it's a time consuming function, only for service purposes
        """
        idx = 0
        while idx < len(data):
            # Fix: search from idx onward; the original called
            # data.find(b'\x00') with no start, so it found the same first
            # NUL forever and appended empty strings in an infinite loop.
            strEnd = data.find(b'\x00', idx)
            if strEnd < 0:
                return
            feature = str(data[idx:strEnd], self.getCharSet())
            self.__features.append(feature)
            idx = strEnd + 1

    def loadFromBinary(self, dictFile):
        """
            Loads the dictionary from the blob.

            Reads the fixed header, validates the version, then reads the
            charset, double array, token blob and feature blob sections.

            Raises:
                RuntimeError: on an incompatible dictionary version.
        """
        fmt = str('<IIIIIIIIII')
        header = dictFile.read(calcsize(fmt))
        magic, version, dictType, lexSize, \
        leftSize, rightSize, dataSize, \
        tokenPartSize, featurePartSize, dummy = \
            unpack(fmt, header)
        if version != 102:
            raise RuntimeError('Incompatible dictionary version: {0}'.format(version))
        charSetBuffer, = unpack(str('32s'), dictFile.read(32))
        self.charset = extractString(charSetBuffer).lower()
        self.doubleArray = DoubleArray(dictFile.read(dataSize))
        self.tokenBlob = dictFile.read(tokenPartSize)
        self.featureBlob = dictFile.read(featurePartSize)

    def loadTokens(self, dictFile, tokenPartSize):
        """Read *tokenPartSize* bytes worth of fixed-size token records
        from *dictFile* and append them to the token cache."""
        fmt = str('HHHhII')
        tokenSize = calcsize(fmt)
        for i in range(tokenPartSize//tokenSize):
            data = dictFile.read(tokenSize)
            #NOTE: dictionary tokens don't store their texts,
            #      which are available either looking up the token features
            #      or during the parsing
            fields = unpack(fmt, data)
            self.__tokens.append(Token('', fields[0], fields[1], fields[2],
                                           fields[3], fields[4], fields[5] ))

    def internalCommonPrefixSearch(self, text):
        """Encode *text* with the dictionary charset and run a raw
        common-prefix search on the double array."""
        encodedText = bytearray(text, self.getCharSet())
        return self.doubleArray.commonPrefixSearch(encodedText)

    def commonPrefixSearch(self, text):
        """
        Looks up all words matching a part of the text starting from
        the beginning.
        E.g. 'abcd' produces 'a', 'ab' 'abc' given the latters are
        actual words
        """
        try:
            encodedText = bytearray(text, self.getCharSet())
            return self.internalSearch(encodedText, self.doubleArray.commonPrefixSearch)
        except UnicodeEncodeError as e:
            # Text not encodable in the dictionary charset: retry on the
            # encodable prefix; nothing to match if it fails at position 0.
            posError = e.start
            if posError == 0:
                return []
            return self.commonPrefixSearch(text[:posError])

    def internalSearch(self, encodedText, functionToMatch):
        """Run *functionToMatch* over *encodedText* and expand each packed
        result handle into Token objects carrying the matched surface text."""
        tokens = []
        tokenStartIds = functionToMatch(encodedText)
        for tokenHandler, tokenLength in tokenStartIds:
            # Low byte is the token count; remaining bits are the start id.
            tokenNum = tokenHandler & 0xff
            tokenStartId = tokenHandler >> 8
            for i in range(tokenNum):
                d = self.getToken(tokenStartId + i)
                tokenText = text_type(bytes(encodedText[:tokenLength]), self.getCharSet())
                t = Token(tokenText, d.leftAttribute,
                          d.rightAttribute, d.partOfSpeechId,
                          d.wordCost, d.featureId, d.compound)
                tokens.append(t)
        return tokens

    def exactMatchSearch(self, text):
        """
            Looks up all words matching the text starting from
            the beginning.
            E.g. 'abcd' produces 'abcd'  given the latter is an existing word.
        """
        try:
            encodedText = bytearray(text, self.getCharSet())
            return self.internalSearch(encodedText, self.doubleArray.exactMatchSearch)
        except UnicodeEncodeError:
            # Text not representable in the dictionary charset: no match.
            return []

    def getCharSet(self):
        """
            Gets the dictionary charset: euc-jp, utf-8, etc.
        """
        return self.charset

    def getFeature(self, featureId):
        """
            Gets the dictionary entry for the word
        """
        strEnd = self.featureBlob.find(b'\x00', featureId)
        if strEnd >= 0:
            feature = text_type(self.featureBlob[featureId:strEnd], self.getCharSet())
            return feature
        else:
            return None