def __init__(self):
    """Create the collaborating helper objects and reset lookup state.

    The helpers are constructed in a fixed order because the Picker is
    built from the Dictionary instance created first.
    """
    # Helper objects.
    self.dict = Dictionary()
    self.spliter = PinyinSpliter()
    self.fitter = Fitter()
    self.picker = Picker(self.dict)

    # Cache stack; it starts with a single placeholder entry of the form
    # [fit point, fit list, preedit value].
    initialEntry = [0, [], ""]
    self.cache = [initialEntry]

    # Candidate bookkeeping: both indices start at zero and no candidate
    # has been collected yet.
    self.candCacheIndex = self.candStartIndex = 0
    self.candList = []
class PinyinLookup():
    """Coordinate a Dictionary, PinyinSpliter, Fitter and Picker.

    State kept on the instance:

    * ``cache`` -- a stack of ``[fitPoint, fitList, preeditList]`` entries.
      It starts with one placeholder entry and grows by one entry per
      ``append`` call; ``pop`` removes the newest entry.
    * ``candList`` -- candidates collected so far by ``getCand``; each item
      is ``[key, word, freq, preeditString, candStartIndex]``.
    * ``candCacheIndex`` -- index of the cache entry the picker is
      currently serving candidates from.
    * ``candStartIndex`` -- value of ``len(candList)`` recorded the last
      time the picker ran out of candidates.
    """

    def __init__(self):
        """Create the helper objects and the initial (empty) lookup state."""
        self.dict = Dictionary()
        self.spliter = PinyinSpliter()
        self.fitter = Fitter()
        self.picker = Picker(self.dict)
        # Placeholder entry so that ``cache[-1]`` is always valid.
        # NOTE(review): the third slot is "" here while ``append`` stores a
        # list there; kept as in the original.
        self.cache = [[0, [], ""]]
        self.candCacheIndex = 0
        self.candStartIndex = 0
        self.candList = []

    def _resetCandidates(self):
        """Drop collected candidates and point at the newest cache entry."""
        self.candList = []
        self.candCacheIndex = len(self.cache) - 1
        self.candStartIndex = 0

    def load(self, filePath):
        """Load a dictionary file and index the keys it reports as new.

        ``filePath`` is handed to ``self.dict.load``; the keys it returns
        are registered with the fitter, and the keys that contain no
        apostrophe are additionally registered with the spliter.
        """
        newKeys = self.dict.load(filePath)
        print("start build index")
        newPinyinSet = set()
        for key in newKeys:
            if "'" not in key:
                self.fitter.pinyinSet.add(key)
                newPinyinSet.add(key)
            # Every key goes into the fitter's tree, with or without an
            # apostrophe.
            self.fitter.dictTree.addKey(key)
        # The set de-duplicates keys so each one is added to the spliter once.
        for pinyin in newPinyinSet:
            self.spliter.beginCharSet.add(pinyin[0])
            self.spliter.pinyinTree.addPath(pinyin)
        print("built")

    def update(self, key, word, freq):
        """Update one dictionary entry and index its key if it is new.

        ``self.dict.update`` returns a truthy key when something has to be
        indexed; a falsy result means there is nothing to do.
        """
        newKey = self.dict.update(key, word, freq)
        if not newKey:
            return
        if "'" not in newKey:
            self.fitter.pinyinSet.add(newKey)
            self.spliter.beginCharSet.add(newKey[0])
            self.spliter.pinyinTree.addPath(newKey)
        self.fitter.dictTree.addKey(newKey)

    def subFit(self, fitList, pinyinStringList):
        """Keep only the keys with the best secondary score.

        ``fitList`` and ``pinyinStringList`` are parallel sequences. The
        score of a pair is ``key.count("'") + 1 - len(pinyinString.string)``.

        Returns ``(newFitList, preeditList)`` where ``preeditList`` holds
        ``str(pinyinString)`` for every key that was kept.
        """
        pairs = list(zip(fitList, pinyinStringList))
        scores = [key.count("'") + 1 - len(pinyinString.string)
                  for key, pinyinString in pairs]

        # Positive scores are ranked just above the -999 floor when the
        # threshold is chosen.
        subFitPoint = -999
        for score in scores:
            if score > 0:
                score = -998
            if score > subFitPoint:
                subFitPoint = score

        # NOTE(review): the filter compares the raw (unclamped) score, so
        # pairs with a positive score always pass. This mirrors the
        # original behaviour; confirm it is intended.
        newFitList = []
        preeditList = []
        for (key, pinyinString), score in zip(pairs, scores):
            if score >= subFitPoint:
                newFitList.append(key)
                preeditList.append(str(pinyinString))
        return newFitList, preeditList

    def append(self, code):
        """Feed ``code`` to the spliter and push a new cache entry.

        Every split on the spliter stack whose ``length`` is at least the
        length of the spliter's code is scored with ``self.fitter.fit``;
        the keys of the best-scoring splits are filtered through ``subFit``,
        given to the picker and stored on the cache stack.
        """
        self.spliter.append(code)
        fitList = []
        pinyinStringList = []
        fitPoint = -999
        codeLength = len(self.spliter.code)
        for pinyinString in self.spliter.stack:
            if pinyinString.length < codeLength:
                continue
            currentFitPoint, keys = self.fitter.fit(pinyinString.string)
            if currentFitPoint < fitPoint:
                continue
            if currentFitPoint > fitPoint:
                # A strictly better score discards everything collected so
                # far. Both lists must be reset together: they are indexed
                # in parallel by subFit, and leaving stale entries in
                # pinyinStringList would pair keys with the wrong split.
                fitPoint = currentFitPoint
                fitList = []
                pinyinStringList = []
            fitList.extend(keys)
            pinyinStringList.extend([pinyinString] * len(keys))
        fitList, preeditList = self.subFit(fitList, pinyinStringList)
        self.picker.set(fitList, preeditList, True)
        self.cache.append([fitPoint, fitList, preeditList])
        self._resetCandidates()

    def pop(self):
        """Undo the most recent ``append``; do nothing if there is none."""
        if len(self.cache) <= 1:
            return
        self.spliter.pop()
        self.cache.pop()
        fitList = self.cache[-1][1]
        preeditList = self.cache[-1][2]
        self.picker.set(fitList, preeditList, True)
        self._resetCandidates()

    def checkCache(self):
        """Step back to an older cache entry and hand it to the picker.

        Starting below ``candCacheIndex``, walks towards the bottom of the
        cache stack and stops at the first entry reached when no candidate
        has been collected yet, or otherwise at the first entry whose fit
        point is at least that of the entry the walk started from.

        Returns True when the picker was given an entry above the
        placeholder at index 0, False when the stack is exhausted.
        """
        fitList = []
        preeditList = []
        currentFitPoint = self.cache[self.candCacheIndex][0]
        while self.candCacheIndex >= 1:
            self.candCacheIndex -= 1
            fitPoint, fitList, preeditList = self.cache[self.candCacheIndex]
            # NOTE(review): the original wrapped this test in
            # ``if len(fitList) >= 0``, which is always true; ``> 0`` may
            # have been intended. Behaviour is left unchanged.
            if not self.candList or fitPoint >= currentFitPoint:
                break
        if self.candCacheIndex >= 1:
            self.picker.set(fitList, preeditList, False)
            return True
        return False

    def getCand(self, index):
        """Return candidate number ``index``, or None if there is none.

        Candidates are pulled from the picker on demand and remembered in
        ``candList``. When the picker has nothing left, ``checkCache`` is
        asked to switch it to an older cache entry.
        """
        flag = True
        while flag and len(self.candList) <= index:
            key, word, freq, preeditString = self.picker.pick()
            if key:
                self.candList.append(
                    [key, word, freq, preeditString, self.candStartIndex])
            else:
                flag = self.checkCache()
                self.candStartIndex = len(self.candList)
        if flag:
            return self.candList[index]
        return None

    def clear(self):
        """Reset the spliter, the picker and all cached lookup state."""
        self.spliter.clear()
        self.picker.set([], [], True)
        self.cache = [[0, [], ""]]
        self._resetCandidates()