class FMMSeg(object):
    """A forward maximum matching (FMM) Chinese word segmentor.

    Known words are kept in a trie.  Segmenting walks the sentence from
    left to right and, at each position, takes the longest match the trie
    reports at that position; when there is no match a single character is
    emitted as a word.
    """

    def __init__(self, wordtrie=None, train=None):
        """Construct a FMM Chinese word segmentor.

        @type train: an iterable of words
        @param train: training set
        @type wordtrie: a trie of words
        @param wordtrie: previously trained trie

        If wordtrie is provided (i.e. is not None), it's deepcopied as the
        initial trie, so the caller's trie is never modified; otherwise a
        new blank trie is constructed.  If train is provided, its words are
        added to the trie above.
        """
        # Compare against None rather than relying on truthiness: an empty
        # trie or an empty training set is still a value the caller passed.
        if wordtrie is not None:
            self._trie = deepcopy(wordtrie)
        else:
            self._trie = Trie()
        if train is not None:
            self.add_words(train)

    def add_words(self, train):
        """Add train words into the trie.

        Each word is stored in the trie under itself as the key.

        @type train: an iterable of words
        @param train: (possibly) new words
        """
        for word in train:
            self._trie[word] = word

    def seg(self, sent):
        """Segment a sentence.

        @type sent: unicode string
        @param sent: the sentence to be segmented
        @return: a list of segmented words; joined back together they
            reproduce sent, and the list is empty for an empty sentence
        """
        words = []
        offset = 0
        while offset < len(sent):
            # The result is used as the (exclusive) end index of the match
            # that starts at offset; None means nothing matched there.
            idx = self._trie.longest_prefix(sent, offset)
            if idx is None or idx <= offset:
                # Either the character at offset is not found in our trie,
                # or the trie reported a zero-length match.  In both cases
                # treat the single character as a whole word so that the
                # loop always makes progress.
                idx = offset + 1
            words.append(sent[offset:idx])
            offset = idx
        return words
def __init__(self, wordtrie=None, train=None):
    """Construct a FMM Chinese word segmentor.

    @type train: an iterable of words
    @param train: training set
    @type wordtrie: a trie of words
    @param wordtrie: previously trained trie

    If wordtrie is provided (i.e. is not None), it's deepcopied as the
    initial trie, so the caller's trie is never modified; otherwise a new
    blank trie is constructed.  If train is provided, its words are added
    to the trie above.
    """
    # Compare against None rather than relying on truthiness: an empty trie
    # or an empty training set is still a value the caller passed.
    if wordtrie is not None:
        self._trie = deepcopy(wordtrie)
    else:
        self._trie = Trie()
    if train is not None:
        self.add_words(train)