def train(self, tagged_corpus, verbose=False):
        """
        Train C{tag.Affix} using the given training data. If this
        method is called multiple times, then the training data will be
        combined.
        
        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of tagged tokens, where each consists of
            C{text} and a C{tag}.
        @type tagged_corpus: C{list} or C{iter(list)}
        """

        if self.size() != 0:
            raise ValueError('Tagger is already trained')
        token_count = hit_count = 0
        fd = ConditionalFreqDist()
        
        for sentence in tagged_corpus:
            for (token, tag) in sentence:
                token_count += 1
                # Only tokens long enough to carry an affix are considered.
                if len(token) >= self._minlength:
                    backoff_tag = self._backoff_tag_one(token)
                    # Record a tag only where the backoff tagger would
                    # get it wrong.
                    if tag != backoff_tag:
                        affix = self._get_affix(token)
                        hit_count += 1
                        fd[affix].inc(tag)
        # Keep the most frequent tag for each affix, pruning affixes
        # whose best tag occurs no more than self._cutoff times.
        for affix in fd.conditions():
            best_tag = fd[affix].max()
            if fd[affix].count(best_tag) > self._cutoff:
                self._model[affix] = best_tag
        # generate stats
        if verbose:
            size = len(self._model)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print "[Trained Affix tagger:",
            print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning)
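
    # Stand-alone sketch of the affix-training idea above, using plain
    # dicts instead of nltk's ConditionalFreqDist. The function name,
    # the fixed-suffix extraction, and the parameters are assumptions
    # for illustration (the real code delegates to self._get_affix),
    # and the comparison against the backoff tagger is omitted:
    #
    #     from collections import defaultdict
    #
    #     def train_affix(tagged_sents, length=3, minlength=5, cutoff=1):
    #         counts = defaultdict(lambda: defaultdict(int))
    #         for sent in tagged_sents:
    #             for (token, tag) in sent:
    #                 if len(token) >= minlength:
    #                     counts[token[-length:]][tag] += 1
    #         model = {}
    #         for affix, tags in counts.items():
    #             best = max(tags, key=tags.get)   # most frequent tag
    #             if tags[best] > cutoff:          # prune rare affixes
    #                 model[affix] = best
    #         return model
    #
    # Pruning by cutoff trades coverage for precision: affixes whose
    # evidence is too thin are left to the backoff tagger.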
    def train(self, tagged_corpus, verbose=False):
        """
        Train C{tag.Unigram} using the given training data.
        
        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of tagged tokens, where each consists of
            C{text} and a C{tag}.
        @type tagged_corpus: C{list} or C{iter(list)}
        """

        if self.size() != 0:
            raise ValueError('Tagger is already trained')
        token_count = hit_count = 0
        fd = ConditionalFreqDist()

        # Allow a single tagged sentence (a list of (token, tag)
        # tuples) to be passed in place of a corpus of sentences.
        if isinstance(tagged_corpus, list) and tagged_corpus and isinstance(tagged_corpus[0], tuple):
            tagged_corpus = [tagged_corpus]

        for sentence in tagged_corpus:
            for (token, tag) in sentence:
                token_count += 1
                backoff_tag = self._backoff_tag_one(token)
                if tag != backoff_tag:
                    hit_count += 1
                    fd[token].inc(tag)
        for token in fd.conditions():
            best_tag = fd[token].max()
            if fd[token].count(best_tag) > self._cutoff:
                self._model[token] = best_tag
        # generate stats
        if verbose:
            size = len(self._model)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print "[Trained Unigram tagger:",
            print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning)
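
    # Sketch of how a model trained this way might be consulted at tag
    # time (hypothetical helper; this excerpt only shows train(), so
    # the lookup-then-backoff behaviour is an assumption):
    #
    #     def tag_one(model, backoff_tag_one, token):
    #         if token in model:
    #             return model[token]           # known token: use best tag
    #         return backoff_tag_one(token)     # unknown: delegate to backoff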
    def train(self, tagged_corpus, verbose=False):
        """
        Train this C{tagger.Ngram} using the given training data.
        
        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of tagged tokens, where each consists of
            C{text} and a C{tag}.
        @type tagged_corpus: C{list} or C{iter(list)}
        """

        if self.size() != 0:
            raise ValueError('Tagger is already trained')
        token_count = hit_count = 0
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            # Reset the tag history at each sentence boundary.
            self._history.clear()
            for (token, tag) in sentence:
                token_count += 1
                history = self._history.get()
                backoff_tag = self._backoff_tag_one(token, history)
                if tag != backoff_tag:
                    hit_count += 1
                    fd[(history, token)].inc(tag)
                # The history always records the true (gold) tag, not
                # the backoff tagger's guess.
                self._history.enqueue(tag)
        # Keep the best tag for each (history, token) context, subject
        # to cutoff-based pruning.
        for context in fd.conditions():
            best_tag = fd[context].max()
            if fd[context].count(best_tag) > self._cutoff:
                self._model[context] = best_tag
        # generate stats
        if verbose:
            size = len(self._model)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print "[Trained %d-gram tagger:" % self._n,
            print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning)
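
    # Stand-alone sketch of the same training scheme for n=2, i.e. a
    # history of one previous tag. Names and the plain-dict model are
    # assumptions for illustration, and the backoff comparison is again
    # omitted:
    #
    #     from collections import defaultdict
    #
    #     def train_bigram(tagged_sents, cutoff=1):
    #         counts = defaultdict(lambda: defaultdict(int))
    #         for sent in tagged_sents:
    #             prev = None                      # history resets per sentence
    #             for (token, tag) in sent:
    #                 counts[(prev, token)][tag] += 1
    #                 prev = tag                   # history holds the true tag
    #         model = {}
    #         for context, tags in counts.items():
    #             best = max(tags, key=tags.get)
    #             if tags[best] > cutoff:          # prune sparse contexts
    #                 model[context] = best
    #         return model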