Example #1
0
 def getMatchChineseWords(self):
     """Return the dictionary words that begin at the current position.

     Successively longer slices of ``self.text`` starting at ``self.pos``
     are passed to ``getDictWord``; every truthy result is collected.  The
     scan stops at the end of the text, after ``maxWordLength`` characters,
     or at the first character rejected by ``isChineseChar``.  ``self.pos``
     is restored before returning.  When nothing matches, the result holds
     a single placeholder ``Word`` with text ``'X'`` and length ``-1``.

     Results are remembered per position in ``self.cache``, whose slots
     are overwritten in rotation using ``self.cacheIndex``.
     """
     # Serve the answer from the cache if this position was seen before.
     for slot in range(self.cacheSize):
         entry = self.cache[slot]
         if entry[0] == self.pos:
             return entry[1]

     start = self.pos
     matches = []
     consumed = 0
     while self.pos < self.textLength and consumed < maxWordLength:
         # NOTE(review): getNextChar presumably peeks at self.pos without
         # advancing -- confirm against its definition.
         if not isChineseChar(self.getNextChar()):
             break
         self.pos += 1
         consumed += 1

         candidate = getDictWord(self.text[start:self.pos])
         if candidate:
             matches.append(candidate)

     # This was only a lookahead: put the cursor back where it started.
     self.pos = start

     if not matches:
         # No word found: store an 'X' placeholder with its length set to -1.
         placeholder = Word()
         placeholder.length = -1
         placeholder.text = 'X'
         matches.append(placeholder)

     # Store the result, then move to the next slot (wrapping to 0).
     self.cache[self.cacheIndex] = (self.pos, matches)
     nextSlot = self.cacheIndex + 1
     self.cacheIndex = nextSlot if nextSlot < self.cacheSize else 0
     return matches
Example #2
0
 def getASCIIWords(self):
     """Return the next run of ASCII characters and advance past it.

     Characters accepted by neither ``isASCIIChar`` nor ``isChineseChar``
     (described in the original notes as whitespace and Chinese/English
     punctuation) are skipped first.  The following run of characters
     accepted by ``isASCIIChar`` is returned; it may be empty, e.g. when
     the end of the text is reached first.  ``self.pos`` is left directly
     after the returned run: separators that follow it are not consumed
     here.
     """
     # Skip the separators in front of the word.
     while self.pos < self.textLength:
         current = self.getNextChar()
         if not isASCIIChar(current) and not isChineseChar(current):
             self.pos += 1
         else:
             break
     wordStart = self.pos

     # Advance until the first character that is not ASCII.
     while self.pos < self.textLength and isASCIIChar(self.getNextChar()):
         self.pos += 1

     return self.text[wordStart:self.pos]
Example #3
0
    def getMatchChineseWords(self):
        """Return the dictionary words that begin at the current position.

        Successively longer slices of ``self.text`` starting at ``self.pos``
        are passed to ``getDictWord``; every truthy result is collected.
        The scan stops at the end of the text, after ``maxWordLength``
        characters, or at the first character rejected by
        ``isChineseChar``.  ``self.pos`` is restored before returning.
        When nothing matches, the result holds a single placeholder
        ``Word`` with text ``'X'`` and length ``-1``.

        Results are remembered per position in ``self.cache``, whose slots
        are overwritten in rotation using ``self.cacheIndex``.
        """
        # Serve the answer from the cache if this position was seen before.
        for slot in range(self.cacheSize):
            entry = self.cache[slot]
            if entry[0] == self.pos:
                return entry[1]

        start = self.pos
        matches = []
        consumed = 0
        while self.pos < self.textLength and consumed < maxWordLength:
            # NOTE(review): getNextChar presumably peeks at self.pos without
            # advancing -- confirm against its definition.
            if not isChineseChar(self.getNextChar()):
                break
            self.pos += 1
            consumed += 1

            candidate = getDictWord(self.text[start:self.pos])
            if candidate:
                matches.append(candidate)

        # This was only a lookahead: put the cursor back where it started.
        self.pos = start

        if not matches:
            # No word found: store an 'X' placeholder with length set to -1.
            placeholder = Word()
            placeholder.length = -1
            placeholder.text = 'X'
            matches.append(placeholder)

        # Store the result, then move to the next slot (wrapping to 0).
        self.cache[self.cacheIndex] = (self.pos, matches)
        nextSlot = self.cacheIndex + 1
        self.cacheIndex = nextSlot if nextSlot < self.cacheSize else 0
        return matches
Example #4
0
    def getASCIIWords(self):
        """Return the next run of ASCII characters and advance past it.

        Characters accepted by neither ``isASCIIChar`` nor ``isChineseChar``
        (described in the original notes as whitespace and Chinese/English
        punctuation) are skipped first.  The following run of characters
        accepted by ``isASCIIChar`` is returned; it may be empty, e.g. when
        the end of the text is reached first.  ``self.pos`` is left directly
        after the returned run: separators that follow it are not consumed
        here.
        """
        # Skip the separators in front of the word.
        while self.pos < self.textLength:
            current = self.getNextChar()
            if not isASCIIChar(current) and not isChineseChar(current):
                self.pos += 1
            else:
                break
        wordStart = self.pos

        # Advance until the first character that is not ASCII.
        while self.pos < self.textLength and isASCIIChar(self.getNextChar()):
            self.pos += 1

        return self.text[wordStart:self.pos]
Example #5
0
 def getNextToken(self):
     """Return the next token, or ``None`` when there is none.

     ``None`` is returned once ``self.pos`` has reached
     ``self.textLength``.  Otherwise the character from
     ``self.getNextChar()`` selects the tokenizer: ``getChineseWords()``
     when ``isChineseChar`` accepts it, else ``getASCIIWords()`` with a
     ``'/'`` appended.  A zero-length token is also reported as ``None``.
     """
     if self.pos >= self.textLength:
         return None

     if isChineseChar(self.getNextChar()):
         token = self.getChineseWords()
     else:
         token = self.getASCIIWords() + '/'

     return token if len(token) > 0 else None
Example #6
0
 def getNextToken(self):
     """Return the next token, or ``None`` when there is none.

     ``None`` is returned once ``self.pos`` has reached
     ``self.textLength``.  Otherwise the character from
     ``self.getNextChar()`` selects the tokenizer: ``getChineseWords()``
     when ``isChineseChar`` accepts it, else ``getASCIIWords()`` with a
     ``'/'`` appended.  A zero-length token is also reported as ``None``.
     """
     if self.pos >= self.textLength:
         return None

     if isChineseChar(self.getNextChar()):
         token = self.getChineseWords()
     else:
         token = self.getASCIIWords() + '/'

     return token if len(token) > 0 else None