Ejemplo n.º 1
0
	def test_tree10_sentence_length_freq(self):
		"""Check the leaf-count histogram of the trees in ``treebank_wsj10.mrg``.

		Each tree yielded by ``itree`` is bucketed by ``len(tree.leaves())``;
		the resulting ``{length: number_of_trees}`` mapping must equal the
		reference figures below.
		"""
		expected = {
			1: 159, 2: 340, 3: 377, 4: 518, 5: 614,
			6: 737, 7: 878, 8: 1107, 9: 1208, 10: 1484,
		}

		observed = {}
		for tree in itree('treebank_wsj10.mrg'):
			n_leaves = len(tree.leaves())
			# dict.get supplies the implicit zero the first time a length shows up.
			observed[n_leaves] = observed.get(n_leaves, 0) + 1

		self.assertEqual(expected, observed)
Ejemplo n.º 2
0
	def test_tree10_postag_freq(self):
		"""Check the tag frequencies over the trees in ``treebank_wsj10.mrg``.

		``tree.pos()`` is iterated as pairs and the second element of each
		pair is counted; the resulting ``{tag: occurrences}`` mapping must
		equal the reference figures below.
		"""
		expected = {
			'PRP$': 412, 'VBG': 735, 'VBD': 2633, 'VBN': 1282, 'VBP': 1361,
			'WDT': 66, 'JJ': 3658, 'WP': 145, 'VBZ': 2320, 'DT': 4586,
			'RP': 141, 'NN': 7718, 'FW': 22, 'POS': 332, 'TO': 1183,
			'PRP': 2000, 'RB': 3071, 'NNS': 3927, 'NNP': 5570, 'VB': 1616,
			'WRB': 96, 'CC': 1036, 'LS': 24, 'PDT': 31, 'RBS': 26,
			'RBR': 113, 'CD': 3004, 'EX': 120, 'IN': 3720, 'MD': 678,
			'NNPS': 192, 'JJS': 106, 'JJR': 228, 'SYM': 51, 'UH': 45,
		}

		observed = {}
		for tree in itree('treebank_wsj10.mrg'):
			# Only the tag half of each pair matters for this count.
			for _, tag in tree.pos():
				observed[tag] = observed.get(tag, 0) + 1

		self.assertEqual(expected, observed)
Ejemplo n.º 3
0
__author__ = 'husnusensoy'
__treebank__ = "treebank.mrg"  # treebank file handed to itree()
__train__ = 0.80  # fraction of the trees assigned to the training split

from math import ceil
from corpus import itree

# Materialise every tree up front: the split below needs len() and slicing.
corpus = list(itree(__treebank__))

# Disabled preprocessing pass, kept for reference.
#from treeutil import filterLexical
#for i in range(len(corpus)):
#    filterLexical(corpus[i])

# Round the training share up; int() keeps the result usable as a slice
# index on interpreters where ceil() returns a float.
train_size = int(ceil(len(corpus) * __train__))

# The leading train_size trees are for training, the remainder for testing.
train_corpus = corpus[:train_size]
test_corpus = corpus[train_size:]

# A single parenthesised argument prints the same text under both the
# Python 2 print statement and the Python 3 print function.
print("Train Corpus: %d Test Corpus: %d" % (len(train_corpus), len(test_corpus)))

from itertools import islice
import nltk


def getParser():
    """


    :return: A Viterbi Parser
    """
    productions = []
Ejemplo n.º 4
0
def _input_trees():
    """Yield the trees to convert.

    Trees come from standard input when no files were given on the command
    line; otherwise from each file in ``args.files``, in order.
    """
    if not args.files:
        for tree in itree_stream(sys.stdin):
            yield tree
    else:
        for path in args.files:
            for tree in itree(path):
                yield tree


def _to_output_json(tree):
    """Return the JSON-ready structure for one tree.

    ``args.cnf`` takes precedence over ``args.brush``: with ``cnf`` set the
    result is ``cnf(brush(j))`` whether or not ``brush`` was requested;
    otherwise ``brush`` is applied only when ``args.brush`` is set.
    """
    j = toJSON(root(tree))

    if args.cnf:
        return cnf(brush(j))
    if args.brush:
        return brush(j)
    return j


try:
    # One JSON document per line on standard output.
    for t in _input_trees():
        json.dump(_to_output_json(t), sys.stdout)
        sys.stdout.write("\n")
except IOError:
    # NOTE(review): every IOError is dropped silently -- presumably so that a
    # closed stdout pipe ends the run quietly, but this also hides unreadable
    # input files. Confirm the intent before narrowing it.
    pass

Ejemplo n.º 5
0
	def test_tree10_sentence_count(self):
		"""Check that ``itree`` yields exactly 7422 trees from ``treebank_wsj10.mrg``."""
		# Count lazily; the trees themselves are not needed.
		tree_count = sum(1 for _ in itree('treebank_wsj10.mrg'))
		self.assertEqual(7422, tree_count)