def getMatchChineseWords(self):
    """Return the dictionary words that begin at the current position.

    Starting at ``self.pos``, consecutive characters accepted by
    ``isChineseChar`` are consumed (at most ``maxWordLength`` of them) and
    every prefix for which ``getDictWord`` returns a truthy value is
    collected.  ``self.pos`` is put back to its starting value before
    returning, so the scan itself does not advance the reader.

    Results are remembered in a fixed-size ring of ``(pos, words)`` entries
    (``self.cache``); a hit on the current position returns the stored list
    without rescanning.

    Returns:
        A list of matched words.  When nothing matched, the list holds a
        single placeholder ``Word`` whose ``text`` is ``'X'`` and whose
        ``length`` is ``-1``.
    """
    # Serve the answer from the cache when this position was already scanned.
    for slot in range(self.cacheSize):
        entry = self.cache[slot]
        if entry[0] == self.pos:
            return entry[1]

    startPos = self.pos
    matches = []
    consumed = 0
    # Grow the candidate one character at a time and look up every prefix.
    while (self.pos < self.textLength
           and consumed < maxWordLength
           and isChineseChar(self.getNextChar())):
        self.pos += 1
        consumed += 1
        candidate = getDictWord(self.text[startPos:self.pos])
        if candidate:
            matches.append(candidate)

    # The scan only peeks ahead; rewind to where it started.
    self.pos = startPos

    # No word found: insert an 'X' placeholder and mark its length as -1.
    if not matches:
        placeholder = Word()
        placeholder.length = -1
        placeholder.text = 'X'
        matches.append(placeholder)

    # Record the result in the current ring slot, then move to the next
    # slot, wrapping back to 0 at the end of the ring.
    self.cache[self.cacheIndex] = (self.pos, matches)
    nextSlot = self.cacheIndex + 1
    self.cacheIndex = 0 if nextSlot >= self.cacheSize else nextSlot
    return matches
def getASCIIWords(self):
    """Read one ASCII word starting at or after the current position.

    Leading characters that are neither ASCII (per ``isASCIIChar``) nor
    Chinese (per ``isChineseChar``) are skipped first -- presumably
    whitespace and Chinese/English punctuation.  The following run of
    ASCII characters is then consumed, leaving ``self.pos`` just past it.

    Returns:
        The consumed run as a slice of ``self.text``; an empty string when
        no ASCII character follows the skipped prefix.
    """
    # Skip Chinese/English punctuation and spaces that precede the word.
    while self.pos < self.textLength:
        current = self.getNextChar()
        startsToken = isASCIIChar(current) or isChineseChar(current)
        if startsToken:
            break
        self.pos += 1

    # Start of the English word.
    wordStart = self.pos

    # Advance to the end of the English word.
    while self.pos < self.textLength and isASCIIChar(self.getNextChar()):
        self.pos += 1

    # Separators after the word are not consumed here; the skip loop above
    # handles them on the next call.
    return self.text[wordStart:self.pos]
def getMatchChineseWords(self):
    """Collect every dictionary word that starts at ``self.pos``.

    The method walks forward over characters accepted by ``isChineseChar``
    (no more than ``maxWordLength`` of them), asks ``getDictWord`` about
    each growing prefix, and keeps the truthy results.  ``self.pos`` is
    reset to its initial value afterwards.

    A ring of ``self.cacheSize`` ``(pos, words)`` entries in ``self.cache``
    memoises results; if the current position is already cached, the stored
    list is returned immediately.

    Returns:
        The list of matched words, or a one-element list containing a
        placeholder ``Word`` (``text == 'X'``, ``length == -1``) when no
        prefix is in the dictionary.
    """
    # Cache lookup: return the stored result for this position, if any.
    for slot in range(self.cacheSize):
        cached = self.cache[slot]
        if cached[0] == self.pos:
            return cached[1]

    beginPos = self.pos
    found = []
    taken = 0
    # Extend the candidate by one character per iteration and test it.
    while (self.pos < self.textLength
           and taken < maxWordLength
           and isChineseChar(self.getNextChar())):
        self.pos += 1
        taken += 1
        hit = getDictWord(self.text[beginPos:self.pos])
        if hit:
            found.append(hit)

    # Undo the look-ahead so the caller decides how far to advance.
    self.pos = beginPos

    # No word found: insert an 'X' placeholder and mark its length as -1.
    if not found:
        filler = Word()
        filler.length = -1
        filler.text = 'X'
        found.append(filler)

    # Store in the current ring slot and step to the next one, wrapping
    # back to 0 once the end of the ring is reached.
    self.cache[self.cacheIndex] = (self.pos, found)
    following = self.cacheIndex + 1
    self.cacheIndex = 0 if following >= self.cacheSize else following
    return found
def getNextToken(self):
    """Return the next token from the text, or ``None`` when there is none.

    If the character at the current position is Chinese (per
    ``isChineseChar``), the token is whatever ``getChineseWords`` returns;
    otherwise it is the result of ``getASCIIWords`` with a ``'/'`` appended.

    Returns:
        The token, or ``None`` when the end of the text has been reached or
        the token is empty.
    """
    # Nothing left to read.
    if self.pos >= self.textLength:
        return None

    if isChineseChar(self.getNextChar()):
        token = self.getChineseWords()
    else:
        # The appended '/' means this branch never yields an empty token,
        # even when getASCIIWords() itself returns ''.
        token = self.getASCIIWords() + '/'

    return token if len(token) > 0 else None
def getNextToken(self):
    """Produce the next token, or ``None`` if no token is available.

    A Chinese character at the current position (as judged by
    ``isChineseChar``) delegates to ``getChineseWords``; any other
    character delegates to ``getASCIIWords`` and appends ``'/'`` to its
    result.

    Returns:
        The token, or ``None`` when the position is at or past the end of
        the text or the token has zero length.
    """
    # End of text: there is no further token.
    if self.pos >= self.textLength:
        return None

    if isChineseChar(self.getNextChar()):
        token = self.getChineseWords()
    else:
        # Because '/' is always appended, a token from this branch is
        # never empty, even if getASCIIWords() returned ''.
        token = self.getASCIIWords() + '/'

    return token if len(token) > 0 else None