def get_word_to_posvec():
    word_to_posvec = {}
    for fileid in ptb.fileids('news'):
        for (word, tag) in ptb.tagged_words(fileid, tagset='universal'):
            if word not in word_to_posvec:
                word_to_posvec[word] = [0] * len(_UNIVERSAL_TAGS)
            word_to_posvec[word][tag_to_index[tag]] += 1
    return word_to_posvec
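# The snippet above assumes module-level _UNIVERSAL_TAGS and tag_to_index;
# a hedged sketch of plausible definitions, using the twelve tags of the
# universal tagset (the exact tuple used by the original project is unknown):
from nltk.corpus import ptb

_UNIVERSAL_TAGS = ('VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP',
                   'CONJ', 'DET', 'NUM', 'PRT', '.', 'X')
tag_to_index = {tag: i for i, tag in enumerate(_UNIVERSAL_TAGS)}

# Hypothetical usage: look up the per-tag count vector for a word.
# posvec = get_word_to_posvec()['form']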
def test_tagged_words(self):
    self.assertEqual(
        ptb.tagged_words('WSJ/00/WSJ_0003.MRG')[:3],
        [('A', 'DT'), ('form', 'NN'), ('of', 'IN')],
    )
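# The method above needs an enclosing unittest.TestCase to run; a minimal
# harness sketch (the class name is an assumption, and the assertion requires
# the licensed Penn Treebank installed under nltk_data/corpora/ptb/):
import unittest
from nltk.corpus import ptb

class PTBTaggedWordsTest(unittest.TestCase):  # hypothetical class name
    def test_tagged_words(self):
        self.assertEqual(
            ptb.tagged_words('WSJ/00/WSJ_0003.MRG')[:3],
            [('A', 'DT'), ('form', 'NN'), ('of', 'IN')],
        )

if __name__ == '__main__':
    unittest.main()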
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import ptb

tagged_corpus = ptb.tagged_words(categories=['news'])
#print(len(tagged_corpus))

# Tags that are not real words: empty elements, brackets, symbols, punctuation.
NON_WORD_TAGS = {'-NONE-', '-LRB-', '-RRB-', 'SYM', ':', '.', ',', '``', "''"}

def nonWord_strip(tag):
    return tag not in NON_WORD_TAGS

print("********* QUESTION 1 ***************")

words_without_lst = [x[0] for x in tagged_corpus if nonWord_strip(x[1])]
#words_without_lst2 = [x[0].lower() for x in tagged_corpus if nonWord_strip(x[1])]
print("The number of words without NON-words is ", len(words_without_lst))

words_without_set = set(
    x[0] for x in ptb.tagged_words(categories=['news']) if nonWord_strip(x[1])
)
print("The number of distinct words without NON-words is ", len(words_without_set))
print("Lexical diversity is ", len(words_without_set) / len(words_without_lst))
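# The commented-out words_without_lst2 above suggests a case-insensitive
# variant; a sketch of the lowercased diversity computation (variable names
# here are assumptions, not from the original script). Lowercasing merges
# tokens like 'The'/'the', so the distinct count and diversity both shrink:
words_lower = [w.lower() for (w, t) in tagged_corpus if nonWord_strip(t)]
print("Case-insensitive lexical diversity is ",
      len(set(words_lower)) / len(words_lower))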
def parse_file(f):
    for word, tag in ptb.tagged_words(f):
        if tag in common.OPEN_CLASSES:
            add_counts(word, super_model[tag])
        elif tag in common.CLOSED_CLASSES:
            observe_closed(word, super_model[tag])
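# common.OPEN_CLASSES, common.CLOSED_CLASSES, add_counts, observe_closed and
# super_model come from the surrounding project; the stand-ins below are an
# illustrative sketch only, not the project's actual definitions:
from collections import Counter, defaultdict

OPEN_CLASSES = {'NN', 'NNS', 'VB', 'VBD', 'JJ', 'RB'}   # admit new members
CLOSED_CLASSES = {'DT', 'IN', 'CC', 'PRP', 'TO'}        # fixed inventory

super_model = defaultdict(Counter)  # tag -> per-word counts

def add_counts(word, counter):
    counter[word] += 1              # open classes: track frequency

def observe_closed(word, counter):
    counter[word] = 1               # closed classes: record membership only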
print(docs[0].doctype)
print(docs[0].date_time)
print(docs[0].headline)
print(docs[0].text)  # doctest: +ELLIPSIS

# parsed corpora
print(treebank.fileids())  # doctest: +ELLIPSIS
print(treebank.words('wsj_0003.mrg'))
print(treebank.tagged_words('wsj_0003.mrg'))
print(treebank.parsed_sents('wsj_0003.mrg')[0])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# nltk.download('ptb')
print(ptb.fileids())  # doctest: +SKIP
# download the corpus from here:
#   https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/treebank.zip
# then extract and place it in the following location: .../nltk_data/corpora/ptb/
print(ptb.words('treebank/combined/wsj_0003.mrg'))  # doctest: +SKIP
print(ptb.tagged_words('treebank/combined/wsj_0003.mrg'))  # doctest: +SKIP
# print(ptb.categories())  # doctest: +SKIP
# print(ptb.fileids('news'))  # doctest: +SKIP
# print(ptb.words(categories=['humor', 'fiction']))  # doctest: +SKIP

# nltk.download('sinica_treebank')
print(sinica_treebank.sents())  # doctest: +SKIP
print(sinica_treebank.parsed_sents()[25])  # doctest: +SKIP

# nltk.download('conll2007')
print(conll2007.sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0].tree())  # doctest: +SKIP

# for tree in ycoe.parsed_sents('cocuraC')[:4]:
#     print(tree)  # doctest: +SKIP

# word lists and lexicons
print(words.fileids())
print(words.words('en'))  # doctest: +ELLIPSIS
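# The +SKIP lines above require corpora that do not ship with NLTK. If
# nltk_data lives in a non-default location, extend the search path before
# using the readers; a minimal sketch (the directory is a placeholder, not
# a real path):
import nltk
nltk.data.path.append('/path/to/nltk_data')  # placeholder path (assumption)

from nltk.corpus import treebank
print(treebank.fileids()[:3])  # the 10% WSJ sample bundled with nltk_data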