Example #1
def generate_corpus_text(corpus, filename: Path) -> None:
    """
    Generate a text file at the given filename (absolute path) from the given
    corpus.
    """
    # Get a list of sentences from the corpus
    raw_sentences = corpus.sents()

    processed_sentences = []
    for sentence in raw_sentences:
        joined_sentence = " ".join(sentence)

        processed_sentence = f"{clean_text(joined_sentence).strip()}\n"

        # Keep the sentence only if its length is within bounds and it
        # contains no disallowed word
        length_ok = SENTENCE_MIN_LEN <= len(processed_sentence) <= SENTENCE_MAX_LEN
        no_disallowed = not any(
            word in processed_sentence for word in DISALLOWED_WORDS)

        if length_ok and no_disallowed:
            processed_sentences.append(processed_sentence)

    # Write the processed sentences to a text file
    with open(filename, "wt") as corpus_file:
        corpus_file.writelines(processed_sentences)
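generate_corpus_text relies on clean_text, SENTENCE_MIN_LEN, SENTENCE_MAX_LEN and DISALLOWED_WORDS being defined at module level in its source project. A minimal usage sketch, assuming placeholder stand-ins for those names and the function above living in the same module:

# Usage sketch only: clean_text and the constants below are assumed
# placeholders for the source project's real definitions.
from pathlib import Path

import nltk
from nltk.corpus import brown

SENTENCE_MIN_LEN = 20         # assumed character bounds
SENTENCE_MAX_LEN = 200
DISALLOWED_WORDS = {"damn"}   # assumed filter list


def clean_text(text: str) -> str:
    # Stand-in for the project's real cleaning step.
    return " ".join(text.split())


nltk.download("brown", quiet=True)
generate_corpus_text(brown, Path("brown_corpus.txt").resolve())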
Example #2
def describe(corpus):
  print "\t".join(["c/w", "w/s", "w/v", "id"])
  for fileid in corpus.fileids():
    nchars = len(corpus.raw(fileid))
    nwords = len(corpus.words(fileid))
    nsents = len(corpus.sents(fileid))
    nvocab = len(set([w.lower() for w in corpus.words(fileid)]))
    print "\t".join([str(nchars/nwords), str(nwords/nsents),
      str(nwords/nvocab), fileid])
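Example #2 (repeated with different formatting as Example #3) is written for Python 2: print statements and truncating integer division. A rough Python 3 equivalent, not taken from either source project:

def describe(corpus):
    """Per-file averages: chars/word, words/sentence, words/vocab item."""
    print("\t".join(["c/w", "w/s", "w/v", "id"]))
    for fileid in corpus.fileids():
        nchars = len(corpus.raw(fileid))
        nwords = len(corpus.words(fileid))
        nsents = len(corpus.sents(fileid))
        nvocab = len({w.lower() for w in corpus.words(fileid)})
        # // mirrors the truncating division of the Python 2 original
        print("\t".join([str(nchars // nwords), str(nwords // nsents),
                         str(nwords // nvocab), fileid]))

# e.g. describe(nltk.corpus.brown) works for any reader exposing raw/words/sents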
Example #3
def describe(corpus):
    print "\t".join(["c/w", "w/s", "w/v", "id"])
    for fileid in corpus.fileids():
        nchars = len(corpus.raw(fileid))
        nwords = len(corpus.words(fileid))
        nsents = len(corpus.sents(fileid))
        nvocab = len(set([w.lower() for w in corpus.words(fileid)]))
        print "\t".join([
            str(nchars / nwords),
            str(nwords / nsents),
            str(nwords / nvocab), fileid
        ])
Example #4
	print ''
	print '  '.join(['Tag'.center(taglen), 'Found'.center(9), 'Actual'.center(10),
					'Precision'.center(13), 'Recall'.center(13)])
	print '  '.join(['='*taglen, '='*9, '='*10, '='*13, '='*13])
	
	for tag in sorted(set(tags_found.keys()) | set(tags_actual.keys())):
		found = tags_found[tag]
		actual = tags_actual[tag]
		precision = nltk.metrics.precision(tag_word_refs[tag], tag_word_test[tag])
		recall = nltk.metrics.recall(tag_word_refs[tag], tag_word_test[tag])
		print '  '.join([tag.ljust(taglen), str(found).rjust(9), str(actual).rjust(10),
			str(precision).ljust(13)[:13], str(recall).ljust(13)[:13]])
	
	print '  '.join(['='*taglen, '='*9, '='*10, '='*13, '='*13])
else:
	sents = corpus.sents(**kwargs)
	taglen = 7
	
	if args.fraction != 1.0:
		cutoff = int(math.ceil(len(sents) * args.fraction))
		sents = sents[:cutoff]
	
	for sent in sents:
		for word, tag in tagger.tag(sent):
			tags_found.inc(tag)
			
			if len(tag) > taglen:
				taglen = len(tag)
	
	print '  '.join(['Tag'.center(taglen), 'Count'.center(9)])
	print '  '.join(['='*taglen, '='*9])
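This snippet (repeated, reformatted, as Example #5) targets the old NLTK 2 API: FreqDist.inc(), .samples() and Python 2 print statements. A self-contained sketch of the same per-tag precision/recall table for NLTK 3 / Python 3, with an assumed toy default tagger and a treebank slice standing in for the script's corpus and tagger arguments:

# NLTK 3 sketch: FreqDist is Counter-like, so .inc(tag) becomes item
# assignment; precision/recall still come from nltk.metrics.
from collections import defaultdict

import nltk
from nltk import FreqDist
from nltk.corpus import treebank
from nltk.metrics import precision, recall

nltk.download('treebank', quiet=True)
tagger = nltk.DefaultTagger('NN')      # assumed stand-in for the real tagger

tags_found, tags_actual = FreqDist(), FreqDist()
tag_word_refs, tag_word_test = defaultdict(set), defaultdict(set)

for n, sent in enumerate(treebank.tagged_sents()[:100]):
    words = [word for word, tag in sent]
    for i, (word, tag) in enumerate(sent):               # gold tags
        tags_actual[tag] += 1
        tag_word_refs[tag].add((n, i))
    for i, (word, tag) in enumerate(tagger.tag(words)):  # predicted tags
        tags_found[tag] += 1
        tag_word_test[tag].add((n, i))

all_tags = sorted(set(tags_found) | set(tags_actual))
taglen = max([7] + [len(tag) for tag in all_tags])

print('  '.join(['Tag'.center(taglen), 'Found'.center(9), 'Actual'.center(10),
                 'Precision'.center(13), 'Recall'.center(13)]))
for tag in all_tags:
    prec = precision(tag_word_refs[tag], tag_word_test[tag])
    rec = recall(tag_word_refs[tag], tag_word_test[tag])
    print('  '.join([tag.ljust(taglen), str(tags_found[tag]).rjust(9),
                     str(tags_actual[tag]).rjust(10),
                     str(prec).ljust(13)[:13], str(rec).ljust(13)[:13]]))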
Example #5
        found = tags_found[tag]
        actual = tags_actual[tag]
        precision = nltk.metrics.precision(tag_word_refs[tag],
                                           tag_word_test[tag])
        recall = nltk.metrics.recall(tag_word_refs[tag], tag_word_test[tag])
        print '  '.join([
            tag.ljust(taglen),
            str(found).rjust(9),
            str(actual).rjust(10),
            str(precision).ljust(13)[:13],
            str(recall).ljust(13)[:13]
        ])

    print '  '.join(['=' * taglen, '=' * 9, '=' * 10, '=' * 13, '=' * 13])
else:
    sents = corpus.sents(**kwargs)
    taglen = 7

    if args.fraction != 1.0:
        cutoff = int(math.ceil(len(sents) * args.fraction))
        sents = sents[:cutoff]

    for sent in sents:
        for word, tag in tagger.tag(sent):
            tags_found.inc(tag)

            if len(tag) > taglen:
                taglen = len(tag)

    print '  '.join(['Tag'.center(taglen), 'Count'.center(9)])
    print '  '.join(['=' * taglen, '=' * 9])
Example #6
	if args.trace:
		print 'evaluating chunker score\n'
	
	chunked_sents = corpus.chunked_sents()
	
	if args.fraction != 1.0:
		cutoff = int(math.ceil(len(chunked_sents) * args.fraction))
		chunked_sents = chunked_sents[:cutoff]
	
	print chunker.evaluate(chunked_sents), '\n'

if args.trace:
	print 'analyzing chunker coverage of %s with %s\n' % (args.corpus, chunker.__class__.__name__)

iobs_found = FreqDist()
sents = corpus.sents()

if args.fraction != 1.0:
	cutoff = int(math.ceil(len(sents) * args.fraction))
	sents = sents[:cutoff]

for sent in sents:
	tree = chunker.parse(tagger.tag(sent))
	
	for child in tree.subtrees(lambda t: t.node != 'S'):
		iobs_found.inc(child.node)

iobs = iobs_found.samples()
justify = max(7, *[len(iob) for iob in iobs])

print 'IOB'.center(justify) + '    Found  '
Example #7
	print 'Accuracy: %f' % nltk.metrics.accuracy(tag_refs, tag_test)
	print 'Unknown words: %d' % len(unknown_words)
	
	if args.trace:
		print ', '.join(sorted(unknown_words))
	
	print ''
	print '  Tag      Found      Actual      Precision      Recall  '
	print '=======  =========  ==========  =============  =========='
	
	for tag in sorted(set(tags_found.keys()) | set(tags_actual.keys())):
		found = tags_found[tag]
		actual = tags_actual[tag]
		precision = nltk.metrics.precision(tag_word_refs[tag], tag_word_test[tag])
		recall = nltk.metrics.recall(tag_word_refs[tag], tag_word_test[tag])
		print '  '.join([tag.ljust(7), str(found).rjust(9), str(actual).rjust(10),
			str(precision).ljust(13)[:13], str(recall).ljust(10)[:13]])
	
	print '=======  =========  ==========  =============  =========='
else:
	for sent in corpus.sents():
		for word, tag in tagger.tag(sent):
			tags_found.inc(tag)
	
	print '  Tag      Found  '
	print '=======  ========='
	
	for tag in sorted(tags_found.samples()):
		print '  '.join([tag.ljust(7), str(tags_found[tag]).rjust(9)])
	
	print '=======  ========='
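The "Accuracy" line near the top of this example is nltk.metrics.accuracy over flattened reference and test tag lists built elsewhere in the script. A small self-contained Python 3 sketch of that calculation, with an assumed unigram tagger and treebank slices standing in for the script's command-line options:

import nltk
from nltk.corpus import treebank
from nltk.metrics import accuracy

nltk.download('treebank', quiet=True)

# Assumed stand-ins: train on the first 3000 treebank sentences, test on 200.
tagger = nltk.UnigramTagger(treebank.tagged_sents()[:3000])

tag_refs, tag_test = [], []
for sent in treebank.tagged_sents()[3000:3200]:
    words = [word for word, tag in sent]
    tag_refs.extend(tag for word, tag in sent)                # gold tags
    tag_test.extend(tag for word, tag in tagger.tag(words))   # predicted tags

print('Accuracy: %f' % accuracy(tag_refs, tag_test))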
Example #8
        print 'evaluating chunker score\n'

    chunked_sents = corpus.chunked_sents()

    if args.fraction != 1.0:
        cutoff = int(math.ceil(len(chunked_sents) * args.fraction))
        chunked_sents = chunked_sents[:cutoff]

    print chunker.evaluate(chunked_sents), '\n'

if args.trace:
    print 'analyzing chunker coverage of %s with %s\n' % (
        args.corpus, chunker.__class__.__name__)

iobs_found = FreqDist()
sents = corpus.sents()

if args.fraction != 1.0:
    cutoff = int(math.ceil(len(sents) * args.fraction))
    sents = sents[:cutoff]

for sent in sents:
    tree = chunker.parse(tagger.tag(sent))

    for child in tree.subtrees(lambda t: t.node != 'S'):
        iobs_found.inc(child.node)

iobs = iobs_found.samples()
justify = max(7, *[len(iob) for iob in iobs])

print 'IOB'.center(justify) + '    Found  '
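Like the earlier copy of this code, the coverage loop uses NLTK 2 idioms: tree.node, FreqDist.inc() and .samples(). A self-contained NLTK 3 / Python 3 sketch of the same idea, with an assumed unigram tagger and a toy regexp chunker in place of the script's own models:

# NLTK 3 sketch: Tree.node is now Tree.label(), and FreqDist counts are
# incremented like a Counter; .samples() becomes iteration/most_common().
import nltk
from nltk import FreqDist
from nltk.corpus import treebank

nltk.download('treebank', quiet=True)

tagger = nltk.UnigramTagger(treebank.tagged_sents()[:3000],
                            backoff=nltk.DefaultTagger('NN'))
chunker = nltk.RegexpParser('NP: {<DT>?<JJ>*<NN.*>+}')   # assumed toy chunker

iobs_found = FreqDist()
for sent in treebank.sents()[:200]:
    tree = chunker.parse(tagger.tag(sent))
    for child in tree.subtrees(lambda t: t.label() != 'S'):
        iobs_found[child.label()] += 1

justify = max([7] + [len(iob) for iob in iobs_found])
print('IOB'.center(justify) + '    Found  ')
for iob, count in iobs_found.most_common():
    print(iob.ljust(justify) + str(count).rjust(9))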
Example #9
    This/det is/verb the/det first/adj sentence/noun ./punc
    Here/det  is/verb  another/adj    sentence/noun ./punc
    Note/verb that/comp you/pron can/verb use/verb
    any/noun tag/noun set/noun

    This/det is/verb the/det second/adj paragraph/noun ./punc
    word/n without/adj a/det tag/noun :/: hello ./punc
    """,
                       b="""
    This/det is/verb the/det second/adj file/noun ./punc
    """)
corpus = TaggedCorpusReader(root, list('ab'))
print(corpus.fileids())
print(str(corpus.root) == str(root))
print(corpus.words())
print(corpus.sents())  # doctest: +ELLIPSIS
print(corpus.paras())  # doctest: +ELLIPSIS
print(corpus.tagged_words())  # doctest: +ELLIPSIS
print(corpus.tagged_sents())  # doctest: +ELLIPSIS
print(corpus.tagged_paras())  # doctest: +ELLIPSIS
print(corpus.raw()[:40])
print(len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()])
print(len(corpus.sents()), [len(corpus.sents(d)) for d in corpus.fileids()])
print(len(corpus.paras()), [len(corpus.paras(d)) for d in corpus.fileids()])
print(corpus.words('a'))
print(corpus.words('b'))
# del_testcorpus(root)
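The call that created root and the two test files is cut off at the top of this example. An assumed-equivalent setup writes two word/tag files into a temporary directory and points TaggedCorpusReader at them:

import tempfile
from pathlib import Path

from nltk.corpus.reader import TaggedCorpusReader

root = Path(tempfile.mkdtemp())
(root / 'a').write_text(
    "This/det is/verb the/det first/adj sentence/noun ./punc\n")
(root / 'b').write_text(
    "This/det is/verb the/det second/adj file/noun ./punc\n")

corpus = TaggedCorpusReader(str(root), list('ab'))
print(corpus.fileids())        # ['a', 'b']
print(corpus.words('a'))       # ['This', 'is', 'the', 'first', ...]
print(corpus.tagged_words())   # [('This', 'DET'), ('is', 'VERB'), ...]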
print(brown.fileids())  # doctest: +ELLIPSIS
print(brown.categories())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(repr(brown.root).replace('\\\\', '/'))  # doctest: +ELLIPSIS
print(brown.words())