def train(self, tagged_corpus, verbose=False):
    """
    Train C{tag.Affix} using the given training data.  This
    method may only be called once; calling it on a tagger that
    has already been trained raises a C{ValueError}.

    @param tagged_corpus: A tagged corpus.  Each item should be
        a C{list} of tagged tokens, where each token consists of
        C{text} and a C{tag}.
    @type tagged_corpus: C{list} or C{iter(list)}
    """
    if self.size() != 0:
        raise ValueError, 'Tagger is already trained'
    token_count = hit_count = 0
    fd = ConditionalFreqDist()
    for sentence in tagged_corpus:
        for (token, tag) in sentence:
            token_count += 1
            # Only tokens of at least the minimum length are
            # handled by the affix model.
            if len(token) >= self._minlength:
                backoff_tag = self._backoff_tag_one(token)
                if tag != backoff_tag:
                    # Record the tag against the token's affix.
                    affix = self._get_affix(token)
                    hit_count += 1
                    fd[affix].inc(tag)
    # Keep only affixes whose best tag occurs more often than
    # the cutoff.
    for affix in fd.conditions():
        best_tag = fd[affix].max()
        if fd[affix].count(best_tag) > self._cutoff:
            self._model[affix] = best_tag
    # Generate stats.
    if verbose:
        size = len(self._model)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print "[Trained Affix tagger:",
        print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
            size, backoff, pruning)
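# A minimal usage sketch (not part of this module): training an affix
# tagger on a toy corpus and tagging an unseen word by its suffix.  The
# constructor arguments shown here are hypothetical; check the class's
# __init__ for the actual signature.
#
#     >>> tagger = Affix(length=-3, minlength=5)    # hypothetical signature
#     >>> tagger.train([[('running', 'VBG'), ('quickly', 'RB')]])
#     >>> tagger.tag(['jumping'])                   # tagged via suffix 'ing'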
def train(self, tagged_corpus, verbose=False):
    """
    Train C{tag.Unigram} using the given training data.

    @param tagged_corpus: A tagged corpus.  Each item should be
        a C{list} of tagged tokens, where each token consists of
        C{text} and a C{tag}.
    @type tagged_corpus: C{list} or C{iter(list)}
    """
    if self.size() != 0:
        raise ValueError, 'Tagger is already trained'
    token_count = hit_count = 0
    fd = ConditionalFreqDist()
    # Allow a single tagged sentence (a list of (text, tag)
    # tuples) to be passed in place of a corpus.
    if isinstance(tagged_corpus, list) and isinstance(tagged_corpus[0], tuple):
        tagged_corpus = [tagged_corpus]
    for sentence in tagged_corpus:
        for (token, tag) in sentence:
            token_count += 1
            backoff_tag = self._backoff_tag_one(token)
            if tag != backoff_tag:
                hit_count += 1
                fd[token].inc(tag)
    # Keep only tokens whose best tag occurs more often than
    # the cutoff.
    for token in fd.conditions():
        best_tag = fd[token].max()
        if fd[token].count(best_tag) > self._cutoff:
            self._model[token] = best_tag
    # Generate stats.
    if verbose:
        size = len(self._model)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print "[Trained Unigram tagger:",
        print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
            size, backoff, pruning)
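# A minimal usage sketch (an assumption, not confirmed by this module):
# because of the isinstance() check above, train() accepts either a corpus
# (a list of tagged sentences) or a single tagged sentence directly.  The
# no-argument constructor is hypothetical; check __init__ for the actual
# signature.
#
#     >>> tagger = Unigram()                        # hypothetical signature
#     >>> tagger.train([('the', 'DT'), ('dog', 'NN'), ('barked', 'VBD')])
#     >>> tagger.tag(['the', 'dog'])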
def train(self, tagged_corpus, verbose=False):
    """
    Train this C{tag.Ngram} using the given training data.

    @param tagged_corpus: A tagged corpus.  Each item should be
        a C{list} of tagged tokens, where each token consists of
        C{text} and a C{tag}.
    @type tagged_corpus: C{list} or C{iter(list)}
    """
    if self.size() != 0:
        raise ValueError, 'Tagger is already trained'
    token_count = hit_count = 0
    fd = ConditionalFreqDist()
    for sentence in tagged_corpus:
        # Start each sentence with an empty tag history.
        self._history.clear()
        for (token, tag) in sentence:
            token_count += 1
            history = self._history.get()
            backoff_tag = self._backoff_tag_one(token, history)
            if tag != backoff_tag:
                hit_count += 1
                fd[(history, token)].inc(tag)
            self._history.enqueue(tag)
    # Keep only contexts whose best tag occurs more often than
    # the cutoff.
    for context in fd.conditions():
        best_tag = fd[context].max()
        if fd[context].count(best_tag) > self._cutoff:
            self._model[context] = best_tag
    # Generate stats.
    if verbose:
        size = len(self._model)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print "[Trained %d-gram tagger:" % self._n,
        print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
            size, backoff, pruning)
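# A sketch of the contexts recorded above, assuming self._history.get()
# returns a tuple of the preceding n-1 tags (an assumption based on how
# the history is used here): for a bigram tagger, the condition for 'dog'
# following a determiner would be (('DT',), 'dog'), and the model maps
# that context to its most frequent tag.  The constructor argument is
# hypothetical; check __init__ for the actual signature.
#
#     >>> tagger = Ngram(2)              # hypothetical signature
#     >>> tagger.train(tagged_sents)     # tagged_sents as in @param above
#     >>> tagger.tag(['the', 'dog', 'barked'])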