def test_tree10_sentence_length_freq(self):
    """Check the sentence-length histogram of ``treebank_wsj10.mrg``.

    A sentence's length is the number of leaves of its tree, as yielded by
    ``itree``. The observed ``{length: count}`` mapping must equal the
    hard-coded expectation exactly.
    """
    # Function-scope import: the file's top-level import block is not
    # visible from this block.
    from collections import Counter

    expected = {
        1: 159, 2: 340, 3: 377, 4: 518, 5: 614,
        6: 737, 7: 878, 8: 1107, 9: 1208, 10: 1484,
    }

    # Convert back to a plain dict so assertEqual compares (and reports
    # differences between) two ordinary dicts.
    actual = dict(Counter(len(t.leaves()) for t in itree('treebank_wsj10.mrg')))

    self.assertEqual(expected, actual)
def test_tree10_postag_freq(self):
    """Check the POS-tag histogram of ``treebank_wsj10.mrg``.

    Every tree yielded by ``itree`` contributes the second element of each
    pair returned by its ``pos()`` method. The observed ``{tag: count}``
    mapping must equal the hard-coded expectation exactly.
    """
    # Function-scope import: the file's top-level import block is not
    # visible from this block.
    from collections import Counter

    expected = {
        'PRP$': 412, 'VBG': 735, 'VBD': 2633, 'VBN': 1282, 'VBP': 1361,
        'WDT': 66, 'JJ': 3658, 'WP': 145, 'VBZ': 2320, 'DT': 4586,
        'RP': 141, 'NN': 7718, 'FW': 22, 'POS': 332, 'TO': 1183,
        'PRP': 2000, 'RB': 3071, 'NNS': 3927, 'NNP': 5570, 'VB': 1616,
        'WRB': 96, 'CC': 1036, 'LS': 24, 'PDT': 31, 'RBS': 26,
        'RBR': 113, 'CD': 3004, 'EX': 120, 'IN': 3720, 'MD': 678,
        'NNPS': 192, 'JJS': 106, 'JJR': 228, 'SYM': 51, 'UH': 45,
    }

    # Count tags straight from a generator; no intermediate per-tree list.
    # Convert back to a plain dict so assertEqual compares two ordinary dicts.
    actual = dict(Counter(
        tag
        for t in itree('treebank_wsj10.mrg')
        for _, tag in t.pos()
    ))

    self.assertEqual(expected, actual)
__author__ = 'husnusensoy'

# Treebank file to load, and the fraction of its trees used for training.
__treebank__ = "treebank.mrg"
__train__ = 0.80

from math import ceil

from corpus import itree

# Materialise every tree up front so the corpus can be measured and sliced.
corpus = list(itree(__treebank__))

#from treeutil import filterLexical

#for i in range(len(corpus)):
#    filterLexical(corpus[i])

# The leading `__train__` share of the corpus (rounded up) is the training
# set; whatever remains is held out for testing.
train_size = int(ceil(len(corpus) * __train__))
train_corpus = corpus[:train_size]
test_corpus = corpus[train_size:]

# Parenthesised single-argument form: prints the same text on Python 2 and
# is also valid syntax on Python 3.
print("Train Corpus: %d Test Corpus: %d" % (len(train_corpus), len(test_corpus)))

from itertools import islice

import nltk


def getParser():
    """
    :return: A Viterbi Parser
    """
    productions = []
# Convert every input tree to JSON and write it to stdout, one object per line.
# Trees come from stdin when no files were given, otherwise from each file in
# the order listed.
try:
    if not args.files:
        trees = itree_stream(sys.stdin)
    else:
        # Lazy: each file is handed to itree() only when the previous one
        # has been fully consumed.
        trees = (t for f in args.files for t in itree(f))

    for t in trees:
        j = toJSON(root(t))

        if args.cnf:
            # The CNF branch always applies brush() first, regardless of
            # args.brush.
            j = cnf(brush(j))
        elif args.brush:
            j = brush(j)

        json.dump(j, sys.stdout)
        sys.stdout.write("\n")
except IOError:
    # NOTE(review): presumably meant to exit quietly when stdout is closed
    # early (e.g. piping into `head`) -- confirm. As written it silences any
    # IOError raised inside the try body, not only one from writing stdout.
    pass
def test_tree10_sentence_count(self):
    """Check that ``itree`` yields 7422 trees from ``treebank_wsj10.mrg``."""
    tree_count = sum(1 for _ in itree('treebank_wsj10.mrg'))
    self.assertEqual(7422, tree_count)