from nltk.corpus import ptb
from nltk.tag.mapping import _UNIVERSAL_TAGS

# Position of each universal POS tag in the per-word count vector.
tag_to_index = {tag: i for i, tag in enumerate(_UNIVERSAL_TAGS)}


def get_word_to_posvec():
    # Count, per word in the PTB 'news' files, occurrences of each universal tag.
    word_to_posvec = {}
    for fileid in ptb.fileids('news'):
        for (word, tag) in ptb.tagged_words(fileid, tagset='universal'):
            if word not in word_to_posvec:
                word_to_posvec[word] = [0] * len(_UNIVERSAL_TAGS)
            word_to_posvec[word][tag_to_index[tag]] += 1
    return word_to_posvec
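A quick usage sketch, assuming the PTB corpus is installed locally; the argmax over a word's count vector recovers its most frequent universal tag ('form', for instance, is a word the WSJ_0003 tests below expect):

word_to_posvec = get_word_to_posvec()
vec = word_to_posvec['form']
# The index with the highest count is the word's most common tag.
best = max(range(len(vec)), key=vec.__getitem__)
print(_UNIVERSAL_TAGS[best], vec)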
Example #2
def test_tagged_words(self):
    self.assertEqual(
        ptb.tagged_words('WSJ/00/WSJ_0003.MRG')[:3],
        [('A', 'DT'), ('form', 'NN'), ('of', 'IN')],
    )
Example #4
# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import ptb

tagged_corpus = ptb.tagged_words(categories=['news'])

#print(len(tagged_corpus))


def nonWord_strip(tag):
    # True when `tag` marks a real word token (not a trace, bracket,
    # symbol, or punctuation tag).
    return tag not in {'-NONE-', '-LRB-', '-RRB-', 'SYM', ':', '.', ',', '``', "''"}


print("*********       QUESTION 1     ***************")
# Tokens: every occurrence whose tag survives the non-word filter.
words_without_lst = [x[0] for x in tagged_corpus if nonWord_strip(x[1])]

# words_without_lst2 = [x[0].lower() for x in tagged_corpus if nonWord_strip(x[1])]

print("The number of word tokens after removing non-words is",
      len(words_without_lst))

# Types: the distinct words among those tokens.
words_without_set = {x[0] for x in tagged_corpus if nonWord_strip(x[1])}

print("The number of distinct word types after removing non-words is",
      len(words_without_set))

# Lexical diversity = distinct types / total tokens.
print("Lexical diversity is", len(words_without_set) / len(words_without_lst))
Example #6
def parse_file(f):
    # Tally the tagged words of one PTB file into the per-tag models.
    # common.OPEN_CLASSES, common.CLOSED_CLASSES, super_model, add_counts,
    # and observe_closed come from the surrounding project.
    for word, tag in ptb.tagged_words(f):
        if tag in common.OPEN_CLASSES:
            add_counts(word, super_model[tag])
        elif tag in common.CLOSED_CLASSES:
            observe_closed(word, super_model[tag])
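For a runnable picture of the same open/closed-class split, here is a self-contained sketch; the tag sets and the Counter-based model are hypothetical stand-ins for the project's `common` module, `super_model`, and its two helpers:

from collections import Counter, defaultdict
from nltk.corpus import ptb

# Hypothetical stand-ins: open classes keep admitting new members (content
# words); closed classes are small, fixed sets of grammatical words.
OPEN_CLASSES = {'NN', 'NNS', 'NNP', 'VB', 'VBD', 'VBZ', 'JJ', 'RB'}
CLOSED_CLASSES = {'DT', 'IN', 'PRP', 'CC', 'TO', 'MD'}

super_model = defaultdict(Counter)  # tag -> word frequency table

def parse_file(f):
    for word, tag in ptb.tagged_words(f):
        if tag in OPEN_CLASSES or tag in CLOSED_CLASSES:
            # Stand-in for add_counts / observe_closed: both branches
            # reduce to tallying the word under its tag here.
            super_model[tag][word] += 1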
Example #9
import nltk
from nltk.corpus import treebank, ptb, sinica_treebank, conll2007, words

# `docs` is defined earlier in the original script and is not shown here.
print(docs[0].doctype)
print(docs[0].date_time)
print(docs[0].headline)
print(docs[0].text)  # doctest: +ELLIPSIS
# parsed corpora
print(treebank.fileids())  # doctest: +ELLIPSIS
print(treebank.words('wsj_0003.mrg'))
print(treebank.tagged_words('wsj_0003.mrg'))
print(treebank.parsed_sents('wsj_0003.mrg')[0])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('ptb')
print(ptb.fileids())  # doctest: +SKIP
# download the corpus from here: https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/treebank.zip
# then extract it to the following location: .../nltk_data/corpora/ptb/
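# A setup sketch under stated assumptions: nltk.data.path[0] is writable and
# the zip unpacks to a top-level treebank/ directory, which yields the
# 'treebank/combined/...' fileids used below.
import os, zipfile, urllib.request
_ptb_dir = os.path.join(nltk.data.path[0], 'corpora', 'ptb')
if not os.path.isdir(_ptb_dir):
    _zip_path, _ = urllib.request.urlretrieve(
        'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/treebank.zip'
    )
    with zipfile.ZipFile(_zip_path) as zf:
        zf.extractall(_ptb_dir)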
print(ptb.words('treebank/combined/wsj_0003.mrg'))  # doctest: +SKIP
print(ptb.tagged_words('treebank/combined/wsj_0003.mrg'))  # doctest: +SKIP
# print(ptb.categories())  # doctest: +SKIP
# print(ptb.fileids('news'))  # doctest: +SKIP
# print(ptb.words(categories=['humor', 'fiction']))  # doctest: +SKIP
# nltk.download('sinica_treebank')
print(sinica_treebank.sents())  # doctest: +SKIP
print(sinica_treebank.parsed_sents()[25])  # doctest: +SKIP
# nltk.download('conll2007')
print(conll2007.sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0].tree())  # doctest: +SKIP
# for tree in ycoe.parsed_sents('cocuraC')[:4]:
#     print(tree)  # doctest: +SKIP
# word lists and lexicons
print(words.fileids())
print(words.words('en'))  # doctest: +ELLIPSIS