Beispiel #1
0
class FMMSeg(object):
    """A forward maximum matching (FMM) Chinese word segmentor.

    Known words are stored in a trie. L{seg} walks a sentence from left
    to right and, at each position, cuts off the longest match reported
    by the trie.
    """

    def __init__(self, wordtrie=None, train=None):
        """Construct a FMM Chinese word segmentor.

        @type wordtrie: a trie of words
        @param wordtrie: previously trained trie
        @type train: an iterable of words
        @param train: training set

        If wordtrie is provided (even an empty one), it's deepcopied as
        the initial trie, so the caller's object is not modified by this
        segmentor; otherwise a new blank trie will be constructed.

        If train is provided, it's appended into the trie above.
        """
        # Compare against None instead of relying on truthiness: an empty
        # trie may evaluate as false, yet it was still explicitly provided
        # and must be the one that gets copied.
        if wordtrie is not None:
            self._trie = deepcopy(wordtrie)
        else:
            self._trie = Trie()
        if train is not None:
            self.add_words(train)

    def add_words(self, train):
        """Add train words into the trie.

        Each word is stored in the trie with itself as the value.

        @type train: an iterable of words
        @param train: (possibly) new words
        """
        for word in train:
            self._trie[word] = word

    def seg(self, sent):
        """Segment a sentence.

        The trie's C{longest_prefix(sent, offset)} result is used as the
        end index of the match starting at C{offset}; C{None} is taken
        to mean that nothing matched there.

        @type sent: unicode string
        @param sent: the sentence to be segmented

        @return: a list of segmented words
        """
        words = []
        offset = 0
        length = len(sent)
        while offset < length:
            idx = self._trie.longest_prefix(sent, offset)
            if idx is None or idx <= offset:
                # Nothing in the trie matches at this position (or the
                # match is empty), so treat the single character as a
                # whole word.  This also guarantees forward progress, so
                # the loop always terminates.
                idx = offset + 1
            words.append(sent[offset:idx])
            offset = idx
        return words
Beispiel #2
0
class FMMSeg(object):
    """A forward maximum matching (FMM) Chinese word segmentor.

    Known words are stored in a trie. L{seg} walks a sentence from left
    to right and, at each position, cuts off the longest match reported
    by the trie.
    """

    def __init__(self, wordtrie=None, train=None):
        """Construct a FMM Chinese word segmentor.

        @type wordtrie: a trie of words
        @param wordtrie: previously trained trie
        @type train: an iterable of words
        @param train: training set

        If wordtrie is provided (even an empty one), it's deepcopied as
        the initial trie, so the caller's object is not modified by this
        segmentor; otherwise a new blank trie will be constructed.

        If train is provided, it's appended into the trie above.
        """
        # Compare against None instead of relying on truthiness: an empty
        # trie may evaluate as false, yet it was still explicitly provided
        # and must be the one that gets copied.
        if wordtrie is not None:
            self._trie = deepcopy(wordtrie)
        else:
            self._trie = Trie()
        if train is not None:
            self.add_words(train)

    def add_words(self, train):
        """Add train words into the trie.

        Each word is stored in the trie with itself as the value.

        @type train: an iterable of words
        @param train: (possibly) new words
        """
        for word in train:
            self._trie[word] = word

    def seg(self, sent):
        """Segment a sentence.

        The trie's C{longest_prefix(sent, offset)} result is used as the
        end index of the match starting at C{offset}; C{None} is taken
        to mean that nothing matched there.

        @type sent: unicode string
        @param sent: the sentence to be segmented

        @return: a list of segmented words
        """
        words = []
        offset = 0
        length = len(sent)
        while offset < length:
            idx = self._trie.longest_prefix(sent, offset)
            if idx is None or idx <= offset:
                # Nothing in the trie matches at this position (or the
                # match is empty), so treat the single character as a
                # whole word.  This also guarantees forward progress, so
                # the loop always terminates.
                idx = offset + 1
            words.append(sent[offset:idx])
            offset = idx
        return words
Beispiel #3
0
    def __init__(self, wordtrie=None, train=None):
        """Construct a FMM Chinese word segmentor.

        @type wordtrie: a trie of words
        @param wordtrie: previously trained trie
        @type train: an iterable of words
        @param train: training set

        If wordtrie is provided (even an empty one), it's deepcopied as
        the initial trie, so the caller's object is not modified by this
        segmentor; otherwise a new blank trie will be constructed.

        If train is provided, it's appended into the trie above.
        """
        # Compare against None instead of relying on truthiness: an empty
        # trie may evaluate as false, yet it was still explicitly provided
        # and must be the one that gets copied.
        if wordtrie is not None:
            self._trie = deepcopy(wordtrie)
        else:
            self._trie = Trie()
        if train is not None:
            self.add_words(train)
Beispiel #4
0
    def __init__(self, wordtrie=None, train=None):
        """Construct a FMM Chinese word segmentor.

        @type wordtrie: a trie of words
        @param wordtrie: previously trained trie
        @type train: an iterable of words
        @param train: training set

        If wordtrie is provided (even an empty one), it's deepcopied as
        the initial trie, so the caller's object is not modified by this
        segmentor; otherwise a new blank trie will be constructed.

        If train is provided, it's appended into the trie above.
        """
        # Compare against None instead of relying on truthiness: an empty
        # trie may evaluate as false, yet it was still explicitly provided
        # and must be the one that gets copied.
        if wordtrie is not None:
            self._trie = deepcopy(wordtrie)
        else:
            self._trie = Trie()
        if train is not None:
            self.add_words(train)