Example #1
    def train(self, tagged_corpus, verbose=False):
        """
        Train C{tag.Unigram} using the given training data.
        
        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of tagged tokens, where each consists of
            C{text} and a C{tag}.
        @type tagged_corpus: C{list} or C{iter(list)}
        """

        if self.size() != 0:
            raise ValueError('Tagger is already trained')
        token_count = hit_count = 0
        fd = ConditionalFreqDist()

        if isinstance(tagged_corpus, list) and isinstance(
                tagged_corpus[0], tuple):
            tagged_corpus = [tagged_corpus]

        for sentence in tagged_corpus:
            for (token, tag) in sentence:
                token_count += 1
                fd[token].inc(tag)
        for token in fd.conditions():
            best_tag = fd[token].max()
            backoff_tag = self._backoff_tag_one(token)
            hits = fd[token].count(best_tag)

            # is the tag we would assign different from the backoff tagger
            # and do we have sufficient evidence?
            if best_tag != backoff_tag and hits > self._cutoff:
                self._model[token] = best_tag
                hit_count += hits

        # generate stats
        if verbose:
            size = len(self._model)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print "[Trained Unigram tagger:",
            print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff,
                                                                pruning)
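
The example above depends on the old NLTK ConditionalFreqDist API (inc, count). As a rough, dependency-free sketch of the same unigram-training idea, the snippet below uses collections.Counter instead; the backoff_tag() helper and the CUTOFF value are illustrative placeholders for the backoff tagger and threshold the real class would supply.

# Rough, self-contained sketch of the unigram training logic above.
# backoff_tag() and CUTOFF are assumed stand-ins, not part of the original code.
from collections import Counter, defaultdict

CUTOFF = 0  # assumed evidence threshold, like self._cutoff above

def backoff_tag(token):
    # stand-in for self._backoff_tag_one(); a real backoff tagger goes here
    return 'NN'

def train_unigram(tagged_corpus):
    freqs = defaultdict(Counter)            # token -> Counter of tags
    for sentence in tagged_corpus:
        for (token, tag) in sentence:
            freqs[token][tag] += 1
    model = {}
    for token, tag_counts in freqs.items():
        best_tag, hits = tag_counts.most_common(1)[0]
        # keep the entry only if it disagrees with the backoff tagger
        # and is backed by enough evidence
        if best_tag != backoff_tag(token) and hits > CUTOFF:
            model[token] = best_tag
    return model

corpus = [[('the', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')],
          [('the', 'DT'), ('cat', 'NN'), ('sleeps', 'VBZ')]]
print(train_unigram(corpus))
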
Example #2
    def train(self, tagged_corpus, verbose=False):
        """
        Train this C{tagger.Ngram} using the given training data.
        
        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of tagged tokens, where each consists of
            C{text} and a C{tag}.
        @type tagged_corpus: C{list} or C{iter(list)}
        """

        if self.size() != 0:
            raise ValueError('Tagger is already trained')
        token_count = hit_count = 0
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            self._history.clear()
            for (token, tag) in sentence:
                token_count += 1
                history = tuple(self._history)
                fd[(history, token)].inc(tag)
                self._history.append(tag)
        for context in fd.conditions():
            best_tag = fd[context].max()
            # unpack the (tag-history, token) pair that keys this condition
            (history, token) = context
            backoff_tag = self._backoff_tag_one(token, history)
            hits = fd[context].count(best_tag)

            # is the tag we would assign different from the backoff tagger
            # and do we have sufficient evidence?
            if best_tag != backoff_tag and hits > self._cutoff:
                self._model[context] = best_tag
                hit_count += hits

        # generate stats
        if verbose:
            size = len(self._model)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print("[Trained %d-gram tagger:" % self._n, end=' ')
            print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" %
                  (size, backoff, pruning))
Example #3
File: ngram.py Project: mgolden/en
    def train(self, tagged_corpus, verbose=False):
        """
        Train this C{tagger.Ngram} using the given training data.
        
        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of tagged tokens, where each consists of
            C{text} and a C{tag}.
        @type tagged_corpus: C{list} or C{iter(list)}
        """

        if self.size() != 0:
            raise ValueError('Tagger is already trained')
        token_count = hit_count = 0
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            self._history.clear()
            for (token, tag) in sentence:
                token_count += 1
                history = tuple(self._history)
                fd[(history, token)].inc(tag)
                self._history.append(tag)
        for context in fd.conditions():
            best_tag = fd[context].max()
            # unpack the (tag-history, token) pair that keys this condition
            (history, token) = context
            backoff_tag = self._backoff_tag_one(token, history)
            hits = fd[context].count(best_tag)

            # is the tag we would assign different from the backoff tagger
            # and do we have sufficient evidence?
            if best_tag != backoff_tag and hits > self._cutoff:
                self._model[context] = best_tag
                hit_count += hits

        # generate stats
        if verbose:
            size = len(self._model)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print("[Trained %d-gram tagger:" % self._n, end=' ')
            print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" %
                  (size, backoff, pruning))
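
For comparison, here is a rough, standard-library-only sketch of the n-gram training shown in Examples #2 and #3: each condition is a (tag-history, token) pair, the history is reset at every sentence boundary, and an entry is kept only when it beats the backoff tagger with enough evidence. The backoff_tag() helper, the order N, and CUTOFF are illustrative placeholders, not part of the code above.

# Rough sketch of the n-gram training logic with plain Counters.
# backoff_tag(), N, and CUTOFF are assumed placeholders.
from collections import Counter, defaultdict, deque

N = 2        # assumed order: condition on the previous N-1 tags
CUTOFF = 0

def backoff_tag(token, history):
    return 'NN'   # stand-in for self._backoff_tag_one()

def train_ngram(tagged_corpus):
    freqs = defaultdict(Counter)          # (history, token) -> Counter of tags
    for sentence in tagged_corpus:
        history = deque(maxlen=N - 1)     # reset the tag history per sentence
        for (token, tag) in sentence:
            freqs[(tuple(history), token)][tag] += 1
            history.append(tag)
    model = {}
    for (history, token), tag_counts in freqs.items():
        best_tag, hits = tag_counts.most_common(1)[0]
        if best_tag != backoff_tag(token, history) and hits > CUTOFF:
            model[(history, token)] = best_tag
    return model

corpus = [[('the', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')]]
print(train_ngram(corpus))
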
Example #4
    def train(self, tagged_corpus, verbose=False):
        """
        Train C{tag.Unigram} using the given training data.
        
        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of tagged tokens, where each consists of
            C{text} and a C{tag}.
        @type tagged_corpus: C{list} or C{iter(list)}
        """

        if self.size() != 0:
            raise ValueError('Tagger is already trained')
        token_count = hit_count = 0
        fd = ConditionalFreqDist()

        if isinstance(tagged_corpus, list) and isinstance(tagged_corpus[0], tuple):
            tagged_corpus = [tagged_corpus]

        for sentence in tagged_corpus:
            for (token, tag) in sentence:
                token_count += 1
                fd[token].inc(tag)
        for token in fd.conditions():
            best_tag = fd[token].max()
            backoff_tag = self._backoff_tag_one(token)
            hits = fd[token].count(best_tag)

            # is the tag we would assign different from the backoff tagger
            # and do we have sufficient evidence?
            if best_tag != backoff_tag and hits > self._cutoff:
                self._model[token] = best_tag
                hit_count += hits
            
        # generate stats
        if verbose:
            size = len(self._model)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print("[Trained Unigram tagger:", end=' ')
            print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" %
                  (size, backoff, pruning))
Example #5
    def train(self, tagged_corpus, verbose=False):
        """
        Train C{tag.Affix} using the given training data. If this
        method is called multiple times, then the training data will be
        combined.
        
        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of tagged tokens, where each consists of
            C{text} and a C{tag}.
        @type tagged_corpus: C{list} or C{iter(list)}
        """

        if self.size() != 0:
            raise ValueError('Tagger is already trained')
        token_count = hit_count = 0
        fd = ConditionalFreqDist()
        
        for sentence in tagged_corpus:
            for (token, tag) in sentence:
                token_count += 1
                # If token is long enough
                if len(token) >= self._minlength:
                    backoff_tag = self._backoff_tag_one(token)
                    if tag != backoff_tag:
                        # get the affix and record it
                        affix = self._get_affix(token)
                        hit_count += 1
                        fd[affix].inc(tag)
        for affix in fd.conditions():
            best_tag = fd[affix].max()
            if fd[affix].count(best_tag) > self._cutoff:
                self._model[affix] = best_tag
        # generate stats
        if verbose:
            size = len(self._model)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print("[Trained Affix tagger:", end=' ')
            print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning))
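
As a rough, self-contained sketch of the affix logic in Examples #5 and #6: only tokens that the backoff tagger would mis-tag contribute evidence, counts are keyed by a fixed-length suffix, and an entry survives only with more than CUTOFF supporting hits. The suffix length, minimum token length, backoff_tag() helper, and CUTOFF below are illustrative placeholders.

# Rough sketch of the affix-training logic with plain Counters; the affix here
# is a fixed-length suffix, and backoff_tag()/MINLENGTH/CUTOFF are assumed
# stand-ins for the attributes used in the examples above.
from collections import Counter, defaultdict

AFFIX_LENGTH = 3   # e.g. look at the last three characters
MINLENGTH = 5      # ignore tokens shorter than this
CUTOFF = 0

def backoff_tag(token):
    return 'NN'    # stand-in for self._backoff_tag_one()

def train_affix(tagged_corpus):
    freqs = defaultdict(Counter)          # suffix -> Counter of tags
    for sentence in tagged_corpus:
        for (token, tag) in sentence:
            # only long-enough tokens that the backoff tagger gets wrong count
            if len(token) >= MINLENGTH and tag != backoff_tag(token):
                freqs[token[-AFFIX_LENGTH:]][tag] += 1
    model = {}
    for affix, tag_counts in freqs.items():
        best_tag, hits = tag_counts.most_common(1)[0]
        if hits > CUTOFF:
            model[affix] = best_tag
    return model

corpus = [[('running', 'VBG'), ('quickly', 'RB'), ('jumping', 'VBG')]]
print(train_affix(corpus))
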
Example #6
    def train(self, tagged_corpus, verbose=False):
        """
        Train C{tag.Affix} using the given training data. If this
        method is called multiple times, then the training data will be
        combined.
        
        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of tagged tokens, where each consists of
            C{text} and a C{tag}.
        @type tagged_corpus: C{list} or C{iter(list)}
        """

        if self.size() != 0:
            raise ValueError('Tagger is already trained')
        token_count = hit_count = 0
        fd = ConditionalFreqDist()
        
        for sentence in tagged_corpus:
            for (token, tag) in sentence:
                token_count += 1
                # If token is long enough
                if len(token) >= self._minlength:
                    backoff_tag = self._backoff_tag_one(token)
                    if tag != backoff_tag:
                        # get the affix and record it
                        affix = self._get_affix(token)
                        hit_count += 1
                        fd[affix].inc(tag)
        for affix in fd.conditions():
            best_tag = fd[affix].max()
            if fd[affix].count(best_tag) > self._cutoff:
                self._model[affix] = best_tag
        # generate stats
        if verbose:
            size = len(self._model)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print("[Trained Affix tagger:", end=' ')
            print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" %
                  (size, backoff, pruning))