def generate_corpus_text(corpus, filename: Path) -> None:
    """Write cleaned corpus sentences to *filename*, one sentence per line.

    Each sentence from ``corpus.sents()`` is joined with spaces, passed
    through ``clean_text`` and stripped; the result (including its trailing
    newline) is kept only when its length lies within
    [SENTENCE_MIN_LEN, SENTENCE_MAX_LEN] and it contains no entry of
    DISALLOWED_WORDS.

    Args:
        corpus: an NLTK-style corpus object exposing ``sents()``.
        filename: absolute path of the output text file.
    """
    processed_sentences = []
    for sentence in corpus.sents():
        # Tokens are already strings, so join them directly (the original
        # wrapped them in a redundant generator expression).
        cleaned = f"{clean_text(' '.join(sentence)).strip()}\n"
        # NOTE(review): this is a substring test — a disallowed word occurring
        # inside a longer word also rejects the sentence; confirm intended.
        has_disallowed = any(word in cleaned for word in DISALLOWED_WORDS)
        if SENTENCE_MIN_LEN <= len(cleaned) <= SENTENCE_MAX_LEN and not has_disallowed:
            processed_sentences.append(cleaned)
    # Explicit encoding keeps the output platform-independent (the original
    # relied on the locale default encoding).
    with open(filename, "wt", encoding="utf-8") as corpus_file:
        corpus_file.writelines(processed_sentences)
def describe(corpus):
    """Print a tab-separated statistics table for *corpus*, one row per file.

    Columns: c/w (characters per word), w/s (words per sentence),
    w/v (words per vocabulary item, i.e. lexical diversity), and the file id.

    NOTE(review): ported from Python 2 ``print`` statements (the rest of this
    file uses f-strings, so it must run under Python 3); ``/`` is now true
    division, so the ratios print as floats rather than truncated ints.
    """
    print("\t".join(["c/w", "w/s", "w/v", "id"]))
    for fileid in corpus.fileids():
        # Hoisted: the original called corpus.words(fileid) twice per file.
        words = corpus.words(fileid)
        nchars = len(corpus.raw(fileid))
        nwords = len(words)
        nsents = len(corpus.sents(fileid))
        nvocab = len({w.lower() for w in words})
        print("\t".join([
            str(nchars / nwords),
            str(nwords / nsents),
            str(nwords / nvocab),
            fileid,
        ]))
def describe(corpus):
    """Print per-fileid corpus statistics as a tab-separated table.

    Each row holds characters-per-word, words-per-sentence,
    words-per-vocab-item (lexical diversity), and the file id.

    NOTE(review): this duplicates the ``describe`` defined earlier in the
    file; the later definition shadows the earlier one at import time.
    Ported from Python 2 ``print`` statements to Python 3; ``/`` is true
    division here, so ratios are floats.
    """
    header = ["c/w", "w/s", "w/v", "id"]
    print("\t".join(header))
    for fid in corpus.fileids():
        toks = corpus.words(fid)  # fetch once; original called words() twice
        n_chars = len(corpus.raw(fid))
        n_words = len(toks)
        n_sents = len(corpus.sents(fid))
        n_vocab = len({t.lower() for t in toks})
        row = [
            str(n_chars / n_words),
            str(n_words / n_sents),
            str(n_words / n_vocab),
            fid,
        ]
        print("\t".join(row))
# NOTE(review): mid-function fragment of a Python 2 NLTK tagger-analysis
# script. It prints a per-tag Found/Actual/Precision/Recall table, and the
# dangling "else:" branch (its matching "if" lies outside this view —
# presumably "if the corpus is tagged") instead tallies raw tag counts with
# FreqDist.inc(). All newlines were lost in extraction and the "else:" cannot
# be re-anchored without the missing context, so the code is left
# byte-identical. A Python 3 port would need print() calls and
# "tags_found[tag] += 1" in place of the removed FreqDist.inc() API —
# TODO confirm against the original script before restoring.
print '' print ' '.join(['Tag'.center(taglen), 'Found'.center(9), 'Actual'.center(10), 'Precision'.center(13), 'Recall'.center(13)]) print ' '.join(['='*taglen, '='*9, '='*10, '='*13, '='*13]) for tag in sorted(set(tags_found.keys()) | set(tags_actual.keys())): found = tags_found[tag] actual = tags_actual[tag] precision = nltk.metrics.precision(tag_word_refs[tag], tag_word_test[tag]) recall = nltk.metrics.recall(tag_word_refs[tag], tag_word_test[tag]) print ' '.join([tag.ljust(taglen), str(found).rjust(9), str(actual).rjust(10), str(precision).ljust(13)[:13], str(recall).ljust(13)[:13]]) print ' '.join(['='*taglen, '='*9, '='*10, '='*13, '='*13]) else: sents = corpus.sents(**kwargs) taglen = 7 if args.fraction != 1.0: cutoff = int(math.ceil(len(sents) * args.fraction)) sents = sents[:cutoff] for sent in sents: for word, tag in tagger.tag(sent): tags_found.inc(tag) if len(tag) > taglen: taglen = len(tag) print ' '.join(['Tag'.center(taglen), 'Count'.center(9)]) print ' '.join(['='*taglen, '='*9])
# NOTE(review): reformatted near-duplicate of the fragment above — the body
# of the per-tag precision/recall loop plus the dangling "else:" tag-counting
# branch. Starts mid-loop (the enclosing "for tag in ...:" and the "if" that
# owns "else:" are outside this view), so it cannot be made syntactically
# valid in isolation; left byte-identical. Python 2 ``print`` statements and
# the removed FreqDist.inc() API would both need porting — TODO confirm
# against the original nltk-trainer-style script before restoring.
found = tags_found[tag] actual = tags_actual[tag] precision = nltk.metrics.precision(tag_word_refs[tag], tag_word_test[tag]) recall = nltk.metrics.recall(tag_word_refs[tag], tag_word_test[tag]) print ' '.join([ tag.ljust(taglen), str(found).rjust(9), str(actual).rjust(10), str(precision).ljust(13)[:13], str(recall).ljust(13)[:13] ]) print ' '.join(['=' * taglen, '=' * 9, '=' * 10, '=' * 13, '=' * 13]) else: sents = corpus.sents(**kwargs) taglen = 7 if args.fraction != 1.0: cutoff = int(math.ceil(len(sents) * args.fraction)) sents = sents[:cutoff] for sent in sents: for word, tag in tagger.tag(sent): tags_found.inc(tag) if len(tag) > taglen: taglen = len(tag) print ' '.join(['Tag'.center(taglen), 'Count'.center(9)]) print ' '.join(['=' * taglen, '=' * 9])
# NOTE(review): mid-script fragment of a Python 2 chunker-coverage analysis:
# evaluates a chunker on chunked_sents() (optionally truncated by
# args.fraction), then counts IOB node labels over chunker.parse() output
# with FreqDist.inc(). Names args/corpus/chunker/tagger are defined outside
# this view; newlines were lost in extraction, so the code is left
# byte-identical. A Python 3 / modern-NLTK port would also need
# Tree.label() in place of the removed t.node attribute and print() calls —
# TODO confirm against the original script.
if args.trace: print 'evaluating chunker score\n' chunked_sents = corpus.chunked_sents() if args.fraction != 1.0: cutoff = int(math.ceil(len(chunked_sents) * args.fraction)) chunked_sents = chunked_sents[:cutoff] print chunker.evaluate(chunked_sents), '\n' if args.trace: print 'analyzing chunker coverage of %s with %s\n' % (args.corpus, chunker.__class__.__name__) iobs_found = FreqDist() sents = corpus.sents() if args.fraction != 1.0: cutoff = int(math.ceil(len(sents) * args.fraction)) sents = sents[:cutoff] for sent in sents: tree = chunker.parse(tagger.tag(sent)) for child in tree.subtrees(lambda t: t.node != 'S'): iobs_found.inc(child.node) iobs = iobs_found.samples() justify = max(7, *[len(iob) for iob in iobs]) print 'IOB'.center(justify) + ' Found '
# NOTE(review): mid-script fragment of a Python 2 tagger-accuracy report:
# prints overall accuracy and unknown-word count, then a fixed-width per-tag
# Found/Actual/Precision/Recall table; the dangling "else:" branch (its "if"
# lies outside this view) instead prints plain tag counts. Note the
# inconsistent widths on the Recall column (ljust(10)[:13] vs the 13-wide
# header) — looks like a latent formatting bug in the original. Newlines
# were lost in extraction and the "else:" cannot be re-anchored, so the code
# is left byte-identical; a restore needs Python 3 print() and
# FreqDist "+= 1" in place of .inc()/.samples() — TODO confirm.
print 'Accuracy: %f' % nltk.metrics.accuracy(tag_refs, tag_test) print 'Unknown words: %d' % len(unknown_words) if args.trace: print ', '.join(sorted(unknown_words)) print '' print ' Tag Found Actual Precision Recall ' print '======= ========= ========== ============= ==========' for tag in sorted(set(tags_found.keys()) | set(tags_actual.keys())): found = tags_found[tag] actual = tags_actual[tag] precision = nltk.metrics.precision(tag_word_refs[tag], tag_word_test[tag]) recall = nltk.metrics.recall(tag_word_refs[tag], tag_word_test[tag]) print ' '.join([tag.ljust(7), str(found).rjust(9), str(actual).rjust(10), str(precision).ljust(13)[:13], str(recall).ljust(10)[:13]]) print '======= ========= ========== ============= ==========' else: for sent in corpus.sents(): for word, tag in tagger.tag(sent): tags_found.inc(tag) print ' Tag Found ' print '======= =========' for tag in sorted(tags_found.samples()): print ' '.join([tag.ljust(7), str(tags_found[tag]).rjust(9)]) print '======= ========='
# NOTE(review): reformatted near-duplicate of the chunker-coverage fragment
# earlier in this file (same logic: evaluate chunker, then tally IOB node
# labels with FreqDist.inc()). Starts mid-script with names
# args/corpus/chunker/tagger defined outside this view; newlines lost in
# extraction, so left byte-identical. Same porting needs as its twin:
# Python 3 print(), Tree.label() instead of t.node, FreqDist "+= 1" —
# TODO confirm against the original script before restoring either copy.
print 'evaluating chunker score\n' chunked_sents = corpus.chunked_sents() if args.fraction != 1.0: cutoff = int(math.ceil(len(chunked_sents) * args.fraction)) chunked_sents = chunked_sents[:cutoff] print chunker.evaluate(chunked_sents), '\n' if args.trace: print 'analyzing chunker coverage of %s with %s\n' % ( args.corpus, chunker.__class__.__name__) iobs_found = FreqDist() sents = corpus.sents() if args.fraction != 1.0: cutoff = int(math.ceil(len(sents) * args.fraction)) sents = sents[:cutoff] for sent in sents: tree = chunker.parse(tagger.tag(sent)) for child in tree.subtrees(lambda t: t.node != 'S'): iobs_found.inc(child.node) iobs = iobs_found.samples() justify = max(7, *[len(iob) for iob in iobs]) print 'IOB'.center(justify) + ' Found '
# NOTE(review): tail of an NLTK TaggedCorpusReader doctest. The fragment
# begins INSIDE a triple-quoted fixture string (the opening of the
# a=""" ... """ argument lies before this view), so the leading word/tag
# text is runtime string data, not code, and must not be edited. The code
# that follows builds a TaggedCorpusReader over files 'a' and 'b' and prints
# its words/sents/paras and tagged variants, then inspects the brown corpus.
# Left byte-identical; restoring it requires the missing opening lines
# (the function/def header and the start of the fixture string) —
# TODO recover from the original NLTK corpus doctest.
This/det is/verb the/det first/adj sentence/noun ./punc Here/det is/verb another/adj sentence/noun ./punc Note/verb that/comp you/pron can/verb use/verb any/noun tag/noun set/noun This/det is/verb the/det second/adj paragraph/noun ./punc word/n without/adj a/det tag/noun :/: hello ./punc """, b=""" This/det is/verb the/det second/adj file/noun ./punc """) corpus = TaggedCorpusReader(root, list('ab')) print(corpus.fileids()) print(str(corpus.root) == str(root)) print(corpus.words()) print(corpus.sents()) # doctest: +ELLIPSIS print(corpus.paras()) # doctest: +ELLIPSIS print(corpus.tagged_words()) # doctest: +ELLIPSIS print(corpus.tagged_sents()) # doctest: +ELLIPSIS print(corpus.tagged_paras()) # doctest: +ELLIPSIS print(corpus.raw()[:40]) print(len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()]) print(len(corpus.sents()), [len(corpus.sents(d)) for d in corpus.fileids()]) print(len(corpus.paras()), [len(corpus.paras(d)) for d in corpus.fileids()]) print(corpus.words('a')) print(corpus.words('b')) # del_testcorpus(root) print(brown.fileids()) # doctest: +ELLIPSIS print(brown.categories()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE print(repr(brown.root).replace('\\\\', '/')) # doctest: +ELLIPSIS print(brown.words())