## coverage analysis ##
#######################

if args.trace:
    # Report which corpus is being analysed and the class of the tagger in use.
    print('analyzing tag coverage of %s with %s\n' % (args.corpus, tagger.__class__.__name__))

# Created whether or not metrics were requested.
tags_found = FreqDist()
unknown_words = set()

if args.metrics:
    # Bookkeeping for comparing the corpus annotation with tagger output:
    # a frequency distribution of corpus tags, parallel lists of
    # reference/test tags, and per-tag sets of the words seen with that tag.
    tags_actual = FreqDist()
    tag_refs, tag_test = [], []
    tag_word_refs = collections.defaultdict(set)
    tag_word_test = collections.defaultdict(set)
    tagged_sents = corpus.tagged_sents(**kwargs)
    # Length of the longest tag seen so far; never drops below 7.
    taglen = 7

    if args.fraction != 1.0:
        # Keep only the leading fraction of the sentences, rounding up.
        cutoff = int(math.ceil(len(tagged_sents) * args.fraction))
        tagged_sents = tagged_sents[:cutoff]

    for tagged_sent in tagged_sents:
        for word, tag in tagged_sent:
            tags_actual.inc(tag)
            tag_refs.append(tag)
            tag_word_refs[tag].add(word)
            taglen = max(taglen, len(tag))
#######################

if args.trace:
    # Report which corpus is being analysed and the class of the tagger in use.
    print('analyzing tag coverage of %s with %s\n' % (args.corpus, tagger.__class__.__name__))

# Created whether or not metrics were requested.
tags_found = FreqDist()
unknown_words = set()

if args.metrics:
    # tag_refs / tag_test are parallel lists: the tag annotated in the corpus
    # and the tag produced by the tagger, appended sentence by sentence.
    tags_actual = FreqDist()
    tag_refs, tag_test = [], []
    tag_word_refs = collections.defaultdict(set)
    tag_word_test = collections.defaultdict(set)

    reference_sents = corpus.tagged_sents(fileids=args.fileids)

    for tagged_sent in reference_sents:
        # Record the tags as annotated in the corpus.
        for word, tag in tagged_sent:
            tags_actual.inc(tag)
            tag_refs.append(tag)
            tag_word_refs[tag].add(word)

        # Strip the annotation, re-tag the bare words with the tagger under
        # analysis, and record what it produced.
        bare_words = nltk.tag.untag(tagged_sent)
        predicted = tagger.tag(bare_words)

        for word, tag in predicted:
            tags_found.inc(tag)
            tag_test.append(tag)
            tag_word_test[tag].add(word)

            # Words the tagger labelled '-NONE-' are collected as unknown.
            if tag == '-NONE-':
                unknown_words.add(word)

    accuracy = nltk.metrics.accuracy(tag_refs, tag_test)
    print('Accuracy: %f' % accuracy)
    print('Unknown words: %d' % len(unknown_words))
#######################

if args.trace:
    # Report which corpus is being analysed and the class of the tagger in use.
    trace_args = (args.corpus, tagger.__class__.__name__)
    print('analyzing tag coverage of %s with %s\n' % trace_args)

# Created whether or not metrics were requested.
tags_found = FreqDist()
unknown_words = set()

if args.metrics:
    # Bookkeeping for comparing the corpus annotation with tagger output:
    # a frequency distribution of corpus tags, parallel lists of
    # reference/test tags, and per-tag sets of the words seen with that tag.
    tags_actual = FreqDist()
    tag_refs, tag_test = [], []
    tag_word_refs = collections.defaultdict(set)
    tag_word_test = collections.defaultdict(set)
    tagged_sents = corpus.tagged_sents(**kwargs)
    # Length of the longest tag seen so far; never drops below 7.
    taglen = 7

    if args.fraction != 1.0:
        # Keep only the leading fraction of the sentences, rounding up.
        cutoff = int(math.ceil(len(tagged_sents) * args.fraction))
        tagged_sents = tagged_sents[:cutoff]

    for tagged_sent in tagged_sents:
        for word, tag in tagged_sent:
            tags_actual.inc(tag)
            tag_refs.append(tag)
            tag_word_refs[tag].add(word)
            taglen = max(taglen, len(tag))
any/noun tag/noun set/noun This/det is/verb the/det second/adj paragraph/noun ./punc word/n without/adj a/det tag/noun :/: hello ./punc """, b=""" This/det is/verb the/det second/adj file/noun ./punc """) corpus = TaggedCorpusReader(root, list('ab')) print(corpus.fileids()) print(str(corpus.root) == str(root)) print(corpus.words()) print(corpus.sents()) # doctest: +ELLIPSIS print(corpus.paras()) # doctest: +ELLIPSIS print(corpus.tagged_words()) # doctest: +ELLIPSIS print(corpus.tagged_sents()) # doctest: +ELLIPSIS print(corpus.tagged_paras()) # doctest: +ELLIPSIS print(corpus.raw()[:40]) print(len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()]) print(len(corpus.sents()), [len(corpus.sents(d)) for d in corpus.fileids()]) print(len(corpus.paras()), [len(corpus.paras(d)) for d in corpus.fileids()]) print(corpus.words('a')) print(corpus.words('b')) # del_testcorpus(root) print(brown.fileids()) # doctest: +ELLIPSIS print(brown.categories()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE print(repr(brown.root).replace('\\\\', '/')) # doctest: +ELLIPSIS print(brown.words()) print(brown.sents()) # doctest: +ELLIPSIS print(brown.paras()) # doctest: +ELLIPSIS print(brown.tagged_words()) # doctest: +ELLIPSIS