## coverage analysis ##
#######################

# Announce the run when tracing is enabled; the trailing "\n" in the
# message leaves a blank line after it.
if args.trace:
	print('analyzing tag coverage of %s with %s\n' % (
		args.corpus, tagger.__class__.__name__))

# Containers for the tagger's output; created whether or not metrics are on.
tags_found, unknown_words = FreqDist(), set()

if args.metrics:
	# Reference-side bookkeeping: a count per tag, the flat sequence of
	# reference tags, and the set of words observed under each tag. The
	# *_test containers are only initialised in this section.
	tags_actual = FreqDist()
	tag_refs, tag_test = [], []
	tag_word_refs = collections.defaultdict(set)
	tag_word_test = collections.defaultdict(set)
	tagged_sents = corpus.tagged_sents(**kwargs)
	# Running maximum of the tag lengths seen, never lower than 7.
	taglen = 7

	# When only part of the corpus is requested, keep the leading
	# sentences, rounding the cut-off up.
	if args.fraction != 1.0:
		cutoff = int(math.ceil(len(tagged_sents) * args.fraction))
		tagged_sents = tagged_sents[:cutoff]

	for tagged_sent in tagged_sents:
		for word, tag in tagged_sent:
			# NOTE(review): FreqDist.inc() is NLTK 2.x API (dropped in
			# NLTK 3) -- confirm the NLTK version this script targets.
			tags_actual.inc(tag)
			tag_refs.append(tag)
			tag_word_refs[tag].add(word)
			taglen = max(taglen, len(tag))
		
#######################

# Announce the run when tracing is enabled; the trailing "\n" in the
# message leaves a blank line after it.
if args.trace:
	print('analyzing tag coverage of %s with %s\n' % (
		args.corpus, tagger.__class__.__name__))

# Containers for the tagger's output; created whether or not metrics are on.
tags_found, unknown_words = FreqDist(), set()

if args.metrics:
	# Parallel reference/test bookkeeping: tag counts, flat tag sequences
	# (compared position by position below), and words seen under each tag.
	tags_actual = FreqDist()
	tag_refs, tag_test = [], []
	tag_word_refs = collections.defaultdict(set)
	tag_word_test = collections.defaultdict(set)

	for tagged_sent in corpus.tagged_sents(fileids=args.fileids):
		# Reference pass: record the tags that come with the corpus.
		# NOTE(review): FreqDist.inc() is NLTK 2.x API (dropped in
		# NLTK 3) -- confirm the NLTK version this script targets.
		for word, tag in tagged_sent:
			tags_actual.inc(tag)
			tag_refs.append(tag)
			tag_word_refs[tag].add(word)

		# Test pass: strip the tags, re-tag the sentence with the tagger
		# under test, and record what it produced.
		for word, tag in tagger.tag(nltk.tag.untag(tagged_sent)):
			tags_found.inc(tag)
			tag_test.append(tag)
			tag_word_test[tag].add(word)

			# Words that come back tagged '-NONE-' are collected as unknown.
			if tag == '-NONE-':
				unknown_words.add(word)

	print('Accuracy: %f' % nltk.metrics.accuracy(tag_refs, tag_test))
	print('Unknown words: %d' % len(unknown_words))
# Example no. 3
# 0
#######################

# Announce the run when tracing is enabled; the trailing "\n" in the
# message leaves a blank line after it.
if args.trace:
    print('analyzing tag coverage of %s with %s\n'
          % (args.corpus, tagger.__class__.__name__))

# Containers for the tagger's output; created whether or not metrics are on.
tags_found, unknown_words = FreqDist(), set()

if args.metrics:
    # Reference-side bookkeeping: a count per tag, the flat sequence of
    # reference tags, and the set of words observed under each tag. The
    # *_test containers are only initialised in this section.
    tags_actual = FreqDist()
    tag_refs, tag_test = [], []
    tag_word_refs = collections.defaultdict(set)
    tag_word_test = collections.defaultdict(set)
    tagged_sents = corpus.tagged_sents(**kwargs)
    # Running maximum of the tag lengths seen, never lower than 7.
    taglen = 7

    # When only part of the corpus is requested, keep the leading
    # sentences, rounding the cut-off up.
    if args.fraction != 1.0:
        cutoff = int(math.ceil(len(tagged_sents) * args.fraction))
        tagged_sents = tagged_sents[:cutoff]

    for tagged_sent in tagged_sents:
        for word, tag in tagged_sent:
            # NOTE(review): FreqDist.inc() is NLTK 2.x API (dropped in
            # NLTK 3) -- confirm the NLTK version this script targets.
            tags_actual.inc(tag)
            tag_refs.append(tag)
            tag_word_refs[tag].add(word)
            taglen = max(taglen, len(tag))
# Example no. 4
# 0
    any/noun tag/noun set/noun

    This/det is/verb the/det second/adj paragraph/noun ./punc
    word/n without/adj a/det tag/noun :/: hello ./punc
    """,
                       b="""
    This/det is/verb the/det second/adj file/noun ./punc
    """)
# Open a tagged-corpus reader over the two file ids 'a' and 'b' under root.
corpus = TaggedCorpusReader(root, ['a', 'b'])
print(corpus.fileids())
print(str(corpus.root) == str(root))

# Print every view of the corpus in turn: untagged first, then tagged,
# each at word, sentence and paragraph level.
for _view in (corpus.words, corpus.sents, corpus.paras,
              corpus.tagged_words, corpus.tagged_sents, corpus.tagged_paras):
    print(_view())

# First 40 characters of the raw text.
print(corpus.raw()[:40])

# Total size followed by the per-file sizes, at each granularity.
for _view in (corpus.words, corpus.sents, corpus.paras):
    print(len(_view()), [len(_view(d)) for d in corpus.fileids()])

# Words of each file on its own.
for _fileid in 'ab':
    print(corpus.words(_fileid))
# del_testcorpus(root)
# List the file ids and categories exposed by the brown reader.
print(brown.fileids())
print(brown.categories())
# Print the repr of the corpus root with each doubled backslash replaced
# by a forward slash.
print(repr(brown.root).replace('\\\\', '/'))

# Print the untagged word/sentence/paragraph views, then the tagged words.
for _view in (brown.words, brown.sents, brown.paras, brown.tagged_words):
    print(_view())