Exemple #1
0
    def test_mergedicsnodes(self):
        tree = Tree.parse(
            '(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4)'
            '(VVPP 5)) (VAINF 6)) (VMFIN 3))',
            parse_leaf=int)
        assert str(mergediscnodes(splitdiscnodes(tree))) == (
            '(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4) (VVPP 5)) '
            '(VAINF 6)) (VMFIN 3))')

        assert str(mergediscnodes(splitdiscnodes(tree, markorigin=True))) == (
            '(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4) (VVPP 5)) '
            '(VAINF 6)) (VMFIN 3))')

        tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))',
                          parse_leaf=int)
        assert str(mergediscnodes(splitdiscnodes(
            tree, markorigin=True))) == ('(S (X (A 0) (A 2)) (X (A 1) (A 3)))')

        tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))',
                          parse_leaf=int)
        assert str(splitdiscnodes(tree, markorigin=True)) == (
            '(S (X*0 (A 0)) (X*0 (A 1)) (X*1 (A 2)) (X*1 (A 3)))')

        tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))',
                          parse_leaf=int)
        assert str(mergediscnodes(
            splitdiscnodes(tree))) == ('(S (X (A 0) (A 1) (A 2) (A 3)))')
Exemple #2
0
def test_optimalbinarize():
	"""Verify that all optimal parsing complexities are lower than or
	equal to the complexities of right-to-left binarizations."""
	from discodop.treetransforms import optimalbinarize, complexityfanout
	from discodop.treebank import NegraCorpusReader
	corpus = NegraCorpusReader('alpinosample.export', punct='move')
	total = violations = violationshd = 0
	for n, (tree, sent) in enumerate(zip(list(
			corpus.trees().values())[:-2000], corpus.sents().values())):
		t = addbitsets(tree)
		if all(fanout(x) == 1 for x in t.subtrees()):
			continue
		print(n, tree, '\n', ' '.join(sent))
		total += 1
		optbin = optimalbinarize(tree.copy(True), headdriven=False, h=None, v=1)
		# undo head-ordering to get a normal right-to-left binarization
		normbin = addbitsets(binarize(canonicalize(Tree.convert(tree))))
		if (max(map(complexityfanout, optbin.subtrees()))
				> max(map(complexityfanout, normbin.subtrees()))):
			print('non-hd\n', tree)
			print(max(map(complexityfanout, optbin.subtrees())), optbin)
			print(max(map(complexityfanout, normbin.subtrees())), normbin, '\n')
			violations += 1

		optbin = optimalbinarize(tree.copy(True), headdriven=True, h=1, v=1)
		normbin = addbitsets(binarize(Tree.convert(tree), horzmarkov=1))
		if (max(map(complexityfanout, optbin.subtrees()))
				> max(map(complexityfanout, normbin.subtrees()))):
			print('hd\n', tree)
			print(max(map(complexityfanout, optbin.subtrees())), optbin)
			print(max(map(complexityfanout, normbin.subtrees())), normbin, '\n')
			violationshd += 1
	print('opt. bin. violations normal: %d / %d;  hd: %d / %d' % (
			violations, total, violationshd, total))
	assert violations == violationshd == 0
Exemple #3
0
def test_optimalbinarize():
	"""Verify that all optimal parsing complexities are lower than or
	equal to the complexities of right-to-left binarizations."""
	from discodop.treetransforms import optimalbinarize, complexityfanout
	from discodop.treebank import NegraCorpusReader
	corpus = NegraCorpusReader('alpinosample.export', punct='move')
	total = violations = violationshd = 0
	for n, (tree, sent) in enumerate(zip(list(
			corpus.trees().values())[:-2000], corpus.sents().values())):
		t = addbitsets(tree)
		if all(fanout(x) == 1 for x in t.subtrees()):
			continue
		print(n, tree, '\n', ' '.join(sent))
		total += 1
		optbin = optimalbinarize(tree.copy(True), headdriven=False, h=None, v=1)
		# undo head-ordering to get a normal right-to-left binarization
		normbin = addbitsets(binarize(canonicalize(Tree.convert(tree))))
		if (max(map(complexityfanout, optbin.subtrees()))
				> max(map(complexityfanout, normbin.subtrees()))):
			print('non-hd\n', tree)
			print(max(map(complexityfanout, optbin.subtrees())), optbin)
			print(max(map(complexityfanout, normbin.subtrees())), normbin, '\n')
			violations += 1

		optbin = optimalbinarize(tree.copy(True), headdriven=True, h=1, v=1)
		normbin = addbitsets(binarize(Tree.convert(tree), horzmarkov=1))
		if (max(map(complexityfanout, optbin.subtrees()))
				> max(map(complexityfanout, normbin.subtrees()))):
			print('hd\n', tree)
			print(max(map(complexityfanout, optbin.subtrees())), optbin)
			print(max(map(complexityfanout, normbin.subtrees())), normbin, '\n')
			violationshd += 1
	print('opt. bin. violations normal: %d / %d;  hd: %d / %d' % (
			violations, total, violationshd, total))
	assert violations == violationshd == 0
Exemple #4
0
def brackettree(treestr, sent, brackets, strtermre):
	""" Parse a single tree presented in bracket format, whether with indices
	or not; sent may be None / empty. """
	if strtermre.search(treestr):  # terminals are not all indices
		treestr = FRONTIERNTRE.sub(' ...)', treestr)
		sent = TERMINALSRE.findall(treestr)
		cnt = count()
		tree = Tree.parse(treestr, brackets=brackets,
				parse_leaf=lambda x: next(cnt))
	else:  # disc. trees with integer indices as terminals
		tree = Tree.parse(treestr, parse_leaf=int,
			brackets=brackets)
		sent = (sent.split() if sent.strip()
				else map(str, range(max(tree.leaves()) + 1)))
	return tree, sent
Exemple #5
0
def test_fragments():
    from discodop._fragments import getctrees, extractfragments, exactcounts
    treebank = """\
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (JJ 4) (NN 5))))\
	The cat saw the hungry dog
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (NN 4))))\
	The cat saw the dog
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (NN 4))))\
	The mouse saw the cat
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (JJ 4) (NN 5))))\
	The mouse saw the yellow cat
(S (NP (DT 0) (JJ 1) (NN 2)) (VP (VBP 3) (NP (DT 4) (NN 5))))\
	The little mouse saw the cat
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (NN 4))))\
	The cat ate the dog
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (NN 4))))\
	The mouse ate the cat""".splitlines()
    trees = [binarize(Tree(line.split('\t')[0])) for line in treebank]
    sents = [line.split('\t')[1].split() for line in treebank]
    for tree in trees:
        for n, idx in enumerate(tree.treepositions('leaves')):
            tree[idx] = n
    params = getctrees(zip(trees, sents))
    fragments = extractfragments(params['trees1'],
                                 0,
                                 0,
                                 params['vocab'],
                                 disc=True,
                                 approx=False)
    counts = exactcounts(list(fragments.values()), params['trees1'],
                         params['trees1'])
    assert len(fragments) == 25
    assert sum(counts) == 100
Exemple #6
0
	def decorate(self, tree, sent):
		"""Return a copy of tree with labels decorated with IDs.

		>>> d = TreeDecorator()
		>>> tree = Tree.parse("(S (NP (DT 0) (N 1)) (VP 2))", parse_leaf=int)
		>>> d.decorate(tree, ['the', 'dog', 'walks'])
		... # doctest: +NORMALIZE_WHITESPACE
		Tree('S', [Tree('NP@1-0', [Tree('DT@1-1', [0]),
			Tree('N@1-2', [1])]), Tree('VP@1-3', [2])])
		>>> d = TreeDecorator(memoize=True)
		>>> print(d.decorate(Tree.parse("(S (NP (DT 0) (N 1)) (VP 2))",
		...		parse_leaf=int), ['the', 'dog', 'walks']))
		(S (NP@1-1 (DT@1-2 0) (N@1-3 1)) (VP@1-4 2))
		>>> print(d.decorate(Tree.parse("(S (NP (DT 0) (N 1)) (VP 2))",
		...		parse_leaf=int), ['the', 'dog', 'barks']))
		(S (NP@1-1 (DT@1-2 0) (N@1-3 1)) (VP@2-4 2))"""
		if self.memoize:
			self.ids = 0
			# wrap tree to get equality wrt sent
			tree = DiscTree(tree.freeze(), sent)
			dectree = ImmutableTree(tree.label, map(self._recdecorate, tree))
		else:
			dectree = Tree.convert(tree.copy(True))
			# skip top node, should not get an ID
			for m, a in enumerate(islice(dectree.subtrees(), 1, None)):
				a.label = "%s@%d-%d" % (a.label, self.n, m)
		self.n += 1
		return dectree
Exemple #7
0
	def __init__(self, tree, sent=None, highlight=(), abbr=False):
		self.tree = tree
		self.sent = sent
		if isinstance(tree, basestring):
			self.tree = Tree.parse(tree,
					parse_leaf=None if sent is None else int)
		if sent is None:
			leaves = self.tree.leaves()
			if (leaves and not any(len(a) == 0 for a in self.tree.subtrees())
					and all(isinstance(a, int) for a in leaves)):
				self.sent = [str(a) for a in leaves]
			else:
				# this deals with empty nodes (frontier non-terminals)
				# and multiple/mixed terminals under non-terminals.
				self.tree = self.tree.copy(True)
				self.sent = []
				for a in self.tree.subtrees():
					if len(a) == 0:
						a.append(len(self.sent))
						self.sent.append(None)
					elif any(not isinstance(b, Tree) for b in a):
						for n, b in enumerate(a):
							if not isinstance(b, Tree):
								a[n] = len(self.sent)
								self.sent.append('%s' % b)
		if abbr:
			if self.tree is tree:
				self.tree = self.tree.copy(True)
			for n in self.tree.subtrees(lambda x: len(x.label) > 5):
				n.label = n.label[:4] + u'\u2026'  # unicode '...' ellipsis
		self.highlight = set()
		self.nodes, self.coords, self.edges = self.nodecoords(
				self.tree, self.sent, highlight)
Exemple #8
0
    def test_binarize(self):
        treestr = '(S (VP (PDS 0) (ADV 3) (VVINF 4)) (VMFIN 1) (PIS 2))'
        origtree = Tree(treestr)
        tree = Tree(treestr)
        tree[1].type = HEAD  # set VMFIN as head
        assert str(binarize(tree, horzmarkov=0)) == (
            '(S (VP (PDS 0) (VP|<> (ADV 3) (VVINF 4))) (S|<> (VMFIN 1)'
            ' (PIS 2)))')
        assert unbinarize(tree) == origtree

        assert str(binarize(tree, horzmarkov=1)) == (
            '(S (VP (PDS 0) (VP|<ADV> (ADV 3) (VVINF 4))) (S|<VMFIN> '
            '(VMFIN 1) (PIS 2)))')
        assert unbinarize(tree) == origtree

        assert str(
            binarize(tree,
                     horzmarkov=1,
                     leftmostunary=False,
                     rightmostunary=True,
                     headoutward=True)
        ) == ('(S (VP (PDS 0) (VP|<ADV> (ADV 3) (VP|<VVINF> (VVINF 4)))) '
              '(S|<VMFIN> (S|<VMFIN> (VMFIN 1)) (PIS 2)))')
        assert unbinarize(tree) == origtree

        assert str(
            binarize(tree,
                     horzmarkov=1,
                     leftmostunary=True,
                     rightmostunary=False,
                     headoutward=True)) == (
                         '(S (S|<VP> (VP (VP|<PDS> (PDS 0) (VP|<ADV> (ADV 3) '
                         '(VVINF 4)))) (S|<VMFIN> (VMFIN 1) (PIS 2))))')
        assert unbinarize(tree) == origtree

        assert str(
            binarize(tree, factor='left', horzmarkov=2, headoutward=True)) == (
                '(S (S|<VMFIN,PIS> (VP (VP|<PDS,ADV> (PDS 0) (ADV 3)) '
                '(VVINF 4)) (VMFIN 1)) (PIS 2))')
        assert unbinarize(tree) == origtree

        tree = Tree('(S (A 0) (B 1) (C 2) (D 3) (E 4) (F 5))')
        assert str(binarize(tree, headoutward=True)) == (
            '(S (A 0) (S|<B,C,D,E,F> (B 1) (S|<C,D,E,F> (C 2) (S|<D,E,F> '
            '(D 3) (S|<E,F> (E 4) (F 5))))))')
Exemple #9
0
def optimalbinarize(tree, sep='|', headdriven=False, h=None, v=1):
	""" Recursively binarize a tree, optimizing for complexity.
	v=0 is not implemented. Setting h to a nonzero integer restricts the
	possible binarizations to head driven binarizations. """
	if h is None:
		tree = Tree.convert(tree)
		for a in list(tree.subtrees(lambda x: len(x) > 1))[::-1]:
			a.sort(key=lambda x: x.leaves())
	return recbinarizetree(addbitsets(tree), sep, headdriven, h or 999, v, ())
Exemple #10
0
def test():
	""" Simple demonstration. """
	a = Tree.parse("(f (d (a 0) (c (b 1))) (e 2))", parse_leaf=int)
	b = Tree.parse("(f (c (d (a 0) (b 1)) (e 2)))", parse_leaf=int)
	result1 = treedist(a, b, debug=True)
	assert result1 == 2
	print('%s\n%s\ndistance: %d' % (a, b, result1))
	result2 = newtreedist(a, b, debug=True)
	assert result2 == 2
	print('%s\n%s\ndistance: %d' % (a, b, result2))
	a = Tree.parse("(f (d (x (a 0)) (b 1) (c 2)) (z 3))", parse_leaf=int)
	b = Tree.parse("(f (c (d (a 0) (x (b 1)) (c 2)) (z 3)))", parse_leaf=int)
	result1 = treedist(a, b, debug=True)
	assert result1 == 3
	print('%s\n%s\ndistance: %d' % (a, b, result1))
	result2 = newtreedist(a, b, debug=True)
	assert result2 == 3
	print('%s\n%s\ndistance: %d' % (a, b, result2))
Exemple #11
0
def dobinarization(trees, sents, binarization, relationalrealizational):
	"""Apply binarization."""
	# fixme: this n should correspond to sentence id
	tbfanout, n = treebank.treebankfanout(trees)
	logging.info('treebank fan-out before binarization: %d #%d\n%s\n%s',
			tbfanout, n, trees[n], ' '.join(sents[n]))
	# binarization
	begin = time.clock()
	msg = 'binarization: %s' % binarization.method
	if binarization.fanout_marks_before_bin:
		trees = [treetransforms.addfanoutmarkers(t) for t in trees]
	if binarization.method is None:
		pass
	elif binarization.method == 'default':
		msg += ' %s h=%d v=%d %s' % (
				binarization.factor, binarization.h, binarization.v,
				'tailmarker' if binarization.tailmarker else '')
		for a in trees:
			treetransforms.binarize(a, factor=binarization.factor,
					tailmarker=binarization.tailmarker,
					horzmarkov=binarization.h, vertmarkov=binarization.v,
					leftmostunary=binarization.leftmostunary,
					rightmostunary=binarization.rightmostunary,
					reverse=binarization.revmarkov,
					headidx=-1 if binarization.markhead else None,
					filterfuncs=(relationalrealizational['ignorefunctions']
						+ (relationalrealizational['adjunctionlabel'], ))
						if relationalrealizational else (),
					labelfun=binarization.labelfun)
	elif binarization.method == 'optimal':
		trees = [Tree.convert(treetransforms.optimalbinarize(tree))
						for n, tree in enumerate(trees)]
	elif binarization.method == 'optimalhead':
		msg += ' h=%d v=%d' % (
				binarization.h, binarization.v)
		trees = [Tree.convert(treetransforms.optimalbinarize(
				tree, headdriven=True, h=binarization.h, v=binarization.v))
				for n, tree in enumerate(trees)]
	trees = [treetransforms.addfanoutmarkers(t) for t in trees]
	logging.info('%s; cpu time elapsed: %gs',
			msg, time.clock() - begin)
	trees = [treetransforms.canonicalize(a).freeze() for a in trees]
	return trees
Exemple #12
0
def test_grammar(debug=False):
    """Demonstrate grammar extraction."""
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers
    from discodop.disambiguation import getderivations, marginalize
    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(corpus.sents().values())
    trees = [
        addfanoutmarkers(binarize(a.copy(True), horzmarkov=1))
        for a in list(corpus.trees().values())[:10]
    ]
    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')
    grammar = Grammar(dopreduction(trees[:2], sents[:2])[0],
                      start=trees[0].label)
    if debug:
        print(grammar)
    _ = grammar.testgrammar()

    grammarx, _backtransform, _, _ = doubledop(trees,
                                               sents,
                                               debug=False,
                                               numproc=1)
    if debug:
        print('\ndouble dop grammar')
    grammar = Grammar(grammarx, start=trees[0].label)
    grammar.getmapping(None,
                       striplabelre=None,
                       neverblockre=re.compile('^#[0-9]+|.+}<'),
                       splitprune=False,
                       markorigin=False)
    if debug:
        print(grammar)
    result, msg = grammar.testgrammar()
    assert result, 'RFE should sum to 1.\n%s' % msg
    for tree, sent in zip(corpus.trees().values(), sents):
        if debug:
            print('sentence:',
                  ' '.join(a.encode('unicode-escape').decode() for a in sent))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        if debug:
            print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')
        if chart:
            getderivations(chart, 100)
            _parses, _msg = marginalize('mpp', chart)
        elif debug:
            print('no parse\n', chart)
        if debug:
            print()
    tree = Tree.parse('(ROOT (S (F (E (S (C (B (A 0))))))))', parse_leaf=int)
    Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))
Exemple #13
0
	def test_mergedicsnodes(self):
		tree = Tree.parse('(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4)'
				'(VVPP 5)) (VAINF 6)) (VMFIN 3))', parse_leaf=int)
		assert str(mergediscnodes(splitdiscnodes(tree))) == (
				'(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4) (VVPP 5)) '
				'(VAINF 6)) (VMFIN 3))')

		assert str(mergediscnodes(splitdiscnodes(tree, markorigin=True))) == (
				'(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4) (VVPP 5)) '
				'(VAINF 6)) (VMFIN 3))')

		tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))', parse_leaf=int)
		assert str(mergediscnodes(splitdiscnodes(tree, markorigin=True))) == (
				'(S (X (A 0) (A 2)) (X (A 1) (A 3)))')

		tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))', parse_leaf=int)
		assert str(splitdiscnodes(tree, markorigin=True)) == (
				'(S (X*0 (A 0)) (X*0 (A 1)) (X*1 (A 2)) (X*1 (A 3)))')

		tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))', parse_leaf=int)
		assert str(mergediscnodes(splitdiscnodes(tree))) == (
				'(S (X (A 0) (A 1) (A 2) (A 3)))')
Exemple #14
0
	def noparse(self, stage, sent, tags, lastsuccessfulparse):
		"""Return parse from previous stage or a dummy parse."""
		# use successful parse from earlier stage if available
		if lastsuccessfulparse is not None:
			parsetree = lastsuccessfulparse.copy(True)
		else:  # Produce a dummy parse for evaluation purposes.
			default = defaultparse([(n, t) for n, t
					in enumerate(tags or (len(sent) * ['NONE']))])
			parsetree = Tree.parse('(%s %s)' % (stage.grammar.start,
					default), parse_leaf=int)
		noparse = True
		prob = 1.0
		return parsetree, prob, noparse
    def get_subtree(self, nt):
        """Return a derivation subtree.

        Parameters
        ----------
        nt : str
            The nonterminal to start with.

        Returns
        -------
        Tree
            The tree with the current node as root.

        """
        edge = self.get_witness(nt)[0]
        if edge is None:
            raise ValueError("There is no witness for %s" % nt)
        if not edge.get_successors():
            stdout.flush()
            return Tree(edge.get_nonterminal(), [self.get_label()[0]])
        else:
            s = edge.get_successors()
            return Tree(edge.get_nonterminal(),
                        [t.get_subtree(n) for t, n in s])
Exemple #16
0
	def postprocess(self, treestr, stage=-1):
		"""Take parse tree and apply postprocessing."""
		parsetree = Tree.parse(treestr, parse_leaf=int)
		if self.stages[stage].split:
			mergediscnodes(unbinarize(parsetree, childchar=':',
					expandunary=False))
		saveheads(parsetree, self.binarization.tailmarker)
		unbinarize(parsetree, expandunary=False)
		removefanoutmarkers(parsetree)
		if self.relationalrealizational:
			parsetree = rrbacktransform(parsetree,
					self.relationalrealizational['adjunctionlabel'])
		if self.transformations:
			reversetransform(parsetree, self.transformations)
		return parsetree, False
Exemple #17
0
	def postprocess(self, treestr, stage=-1, derivs=None):
		""" Take parse tree and apply postprocessing. """
		parsetree = Tree.parse(treestr, parse_leaf=int)
		if self.stages[stage].split:
			mergediscnodes(unbinarize(parsetree, childchar=':'))
		saveheads(parsetree, self.tailmarker)
		unbinarize(parsetree)
		removefanoutmarkers(parsetree)
		if self.relationalrealizational:
			parsetree = rrbacktransform(parsetree,
					self.relationalrealizational['adjunctionlabel'])
		if self.transformations:
			reversetransform(parsetree, self.transformations)
		fragments = derivs.get(treestr) if derivs else None
		return parsetree, fragments, False
Exemple #18
0
	def trees(self, query, subset=None, maxresults=10,
			nofunc=False, nomorph=False):
		subset = subset or self.files
		# %s the sentence number
		# %w complete tree in bracket notation
		# %h the matched subtree in bracket notation
		fmt = r'%s:::%w:::%h\n'
		result = []
		jobs = {}
		for filename in subset:
			try:
				x, maxresults2 = self.cache['trees', query, filename,
						nofunc, nomorph]
			except KeyError:
				maxresults2 = 0
			if not maxresults or maxresults > maxresults2:
				jobs[self._submit(lambda x: list(self._query(
						query, x, fmt, maxresults)), filename)] = filename
			else:
				result.extend(x[:maxresults])
		for future in self._as_completed(jobs):
			filename = jobs[future]
			x = []
			for sentno, line in future.result():
				treestr, match = line.split(':::')
				treestr = filterlabels(treestr, nofunc, nomorph)
				treestr = treestr.replace(" )", " -NONE-)")
				cnt = count()
				if match.startswith('('):
					treestr = treestr.replace(match, '%s_HIGH %s' % tuple(
							match.split(None, 1)), 1)
				else:
					match = ' %s)' % match
					treestr = treestr.replace(match, '_HIGH%s' % match)
				tree = Tree.parse(treestr, parse_leaf=lambda _: next(cnt))
				sent = re.findall(r" +([^ ()]+)(?=[ )])", treestr)
				high = list(tree.subtrees(lambda n: n.label.endswith("_HIGH")))
				if high:
					high = high.pop()
					high.label = high.label.rsplit("_", 1)[0]
					high = list(high.subtrees()) + high.leaves()
				x.append((filename, sentno, tree, sent, high))
			self.cache['trees', query, filename,
					nofunc, nomorph] = x, maxresults
			result.extend(x)
		return result
Exemple #19
0
	def test_balancedpunctraise(self):
		tree = ParentedTree.parse('(ROOT ($, 3) ($[ 7) ($[ 13) ($, 14) ($, 20)'
				' (S (NP (ART 0) (ADJA 1) (NN 2) (NP (CARD 4) (NN 5) (PP'
				' (APPR 6) (CNP (NN 8) (ADV 9) (ISU ($. 10) ($. 11)'
				' ($. 12))))) (S (PRELS 15) (MPN (NE 16) (NE 17)) (ADJD 18)'
				' (VVFIN 19))) (VVFIN 21) (ADV 22) (NP (ADJA 23) (NN 24)))'
				' ($. 25))', parse_leaf=int)
		sent = ("Die zweite Konzertreihe , sechs Abende mit ' Orgel plus "
				". . . ' , die Hayko Siemens musikalisch leitet , bietet "
				"wieder ungewoehnliche Kombinationen .".split())
		punctraise(tree, sent)
		balancedpunctraise(tree, sent)
		assert max(map(fanout, addbitsets(tree).subtrees())) == 1

		nopunct = Tree.parse('(ROOT (S (NP (ART 0) (ADJA 1) (NN 2) (NP '
				'(CARD 3) (NN 4) (PP (APPR 5) (CNP (NN 6) (ADV 7)))) (S '
				'(PRELS 8) (MPN (NE 9) (NE 10)) (ADJD 11) (VVFIN 12))) '
				'(VVFIN 13) (ADV 14) (NP (ADJA 15) (NN 16))))', parse_leaf=int)
		assert max(map(fanout, addbitsets(nopunct).subtrees())) == 1
Exemple #20
0
	def test_balancedpunctraise(self):
		tree = ParentedTree.parse('(ROOT ($, 3) ($[ 7) ($[ 13) ($, 14) ($, 20)'
				' (S (NP (ART 0) (ADJA 1) (NN 2) (NP (CARD 4) (NN 5) (PP'
				' (APPR 6) (CNP (NN 8) (ADV 9) (ISU ($. 10) ($. 11)'
				' ($. 12))))) (S (PRELS 15) (MPN (NE 16) (NE 17)) (ADJD 18)'
				' (VVFIN 19))) (VVFIN 21) (ADV 22) (NP (ADJA 23) (NN 24)))'
				' ($. 25))', parse_leaf=int)
		sent = ("Die zweite Konzertreihe , sechs Abende mit ' Orgel plus "
				". . . ' , die Hayko Siemens musikalisch leitet , bietet "
				"wieder ungewoehnliche Kombinationen .".split())
		punctraise(tree, sent)
		balancedpunctraise(tree, sent)
		assert max(map(fanout, addbitsets(tree).subtrees())) == 1

		nopunct = Tree.parse('(ROOT (S (NP (ART 0) (ADJA 1) (NN 2) (NP '
				'(CARD 3) (NN 4) (PP (APPR 5) (CNP (NN 6) (ADV 7)))) (S '
				'(PRELS 8) (MPN (NE 9) (NE 10)) (ADJD 11) (VVFIN 12))) '
				'(VVFIN 13) (ADV 14) (NP (ADJA 15) (NN 16))))', parse_leaf=int)
		assert max(map(fanout, addbitsets(nopunct).subtrees())) == 1
cp = ConfigParser()
cp.read(argv[1])
config = corpusparam(**cp["Corpus"], **cp["Grammar"])

from discodop.tree import Tree
from discodop.treebank import READERS
from discodop.treetransforms import addfanoutmarkers, binarize, collapseunary
from discodop.lexgrammar import SupertagCorpus, SupertagGrammar

corpus = READERS[config.inputfmt](config.filename, encoding=config.inputenc, punct="move")
trees = [
    addfanoutmarkers(
     binarize(
      collapseunary(
       Tree.convert(t), collapseroot=True, collapsepos=True),
      horzmarkov=config.h, vertmarkov=config.v))
    for t in corpus.trees().values()]
sents = list(corpus.sents().values())

corpus = SupertagCorpus(trees, sents)

size = len(corpus.sent_corpus)
portions = config.split.split()
names = "train dev test".split()
assert len(portions) in [3,4]

if portions[0] == "debug":
    portions = tuple(int(portion) for portion in portions[1:2]+portions[1:])
    limits = tuple((name, slice(0, end)) for name, end in zip(names, portions))
else:
Exemple #22
0
def reattach():
    """Re-draw tree after re-attaching node under new parent."""
    sentno = int(request.args.get('sentno'))  # 1-indexed
    sent = SENTENCES[QUEUE[sentno - 1][0]]
    senttok, _ = worker.postokenize(sent)
    treestr = request.args.get('tree', '')
    try:
        tree, _sent1 = validate(treestr, senttok)
    except ValueError as err:
        return str(err)
    dt = DrawTree(tree, senttok)
    error = ''
    if request.args.get('newparent') == 'deletenode':
        # remove nodeid by replacing it with its children
        _treeid, nodeid = request.args.get('nodeid', '').lstrip('t').split('_')
        nodeid = int(nodeid)
        x = dt.nodes[nodeid]
        if nodeid == 0 or isinstance(x[0], int):
            error = 'ERROR: cannot remove ROOT or POS node'
        else:
            children = list(x)
            x[:] = []
            for y in dt.nodes[0].subtrees():
                if any(child is x for child in y):
                    i = y.index(x)
                    y[i:i + 1] = children
                    tree = canonicalize(dt.nodes[0])
                    dt = DrawTree(tree, senttok)  # kludge..
                    break
    elif request.args.get('nodeid', '').startswith('newlabel_'):
        # splice in a new node under parentid
        _treeid, newparent = request.args.get('newparent',
                                              '').lstrip('t').split('_')
        newparent = int(newparent)
        label = request.args.get('nodeid').split('_', 1)[1]
        y = dt.nodes[newparent]
        if isinstance(y[0], int):
            error = 'ERROR: cannot add node under POS tag'
        else:
            children = list(y)
            y[:] = []
            y[:] = [Tree(label, children)]
            tree = canonicalize(dt.nodes[0])
            dt = DrawTree(tree, senttok)  # kludge..
    else:  # re-attach existing node at existing new parent
        _treeid, nodeid = request.args.get('nodeid', '').lstrip('t').split('_')
        nodeid = int(nodeid)
        _treeid, newparent = request.args.get('newparent',
                                              '').lstrip('t').split('_')
        newparent = int(newparent)
        # remove node from old parent
        # dt.nodes[nodeid].parent.pop(dt.nodes[nodeid].parent_index)
        x = dt.nodes[nodeid]
        y = dt.nodes[newparent]
        for node in x.subtrees():
            if node is y:
                error = ('ERROR: cannot re-attach subtree'
                         ' under (descendant of) itself\n')
                break
        else:
            for node in dt.nodes[0].subtrees():
                if any(child is x for child in node):
                    if len(node) > 1:
                        node.remove(x)
                        dt.nodes[newparent].append(x)
                        tree = canonicalize(dt.nodes[0])
                        dt = DrawTree(tree, senttok)  # kludge..
                    else:
                        error = ('ERROR: re-attaching only child creates'
                                 ' empty node %s; remove manually\n' % node)
                    break
    treestr = writediscbrackettree(tree, senttok, pretty=True).rstrip()
    link = ('<a href="/annotate/accept?%s">accept this tree</a>' %
            urlencode(dict(sentno=sentno, tree=treestr)))
    if error == '':
        session['actions'][REATTACH] += 1
        session.modified = True
    return Markup('%s\n\n%s%s\t%s' % (link, error,
                                      dt.text(unicodelines=True,
                                              html=True,
                                              funcsep='-',
                                              morphsep='/',
                                              nodeprops='t0'), treestr))
Exemple #23
0
def test_treedraw():
    """Draw some trees. Only tests whether no exception occurs."""
    trees = '''(ROOT (S (ADV 0) (VVFIN 1) (NP (PDAT 2) (NN 3)) (PTKNEG 4) \
				(PP (APPRART 5) (NN 6) (NP (ART 7) (ADJA 8) (NN 9)))) ($. 10))
			(S (NP (NN 1) (EX 3)) (VP (VB 0) (JJ 2)))
			(S (VP (PDS 0) (ADV 3) (VVINF 4)) (PIS 2) (VMFIN 1))
			(top (du (comp 0) (smain (noun 1) (verb 2) (inf (verb 8) (inf \
				(adj 3) (pp (prep 4) (np (det 5) (noun 6))) (part 7) (verb 9) \
				(pp (prep 10) (np (det 11) (noun 12) (pp (prep 13) (mwu \
				(noun 14) (noun 15))))))))) (punct 16))
			(top (smain (noun 0) (verb 1) (inf (verb 5) (inf (np (det 2) \
				(adj 3) (noun 4)) (verb 6) (pp (prep 7) (noun 8))))) (punct 9))
			(top (smain (noun 0) (verb 1) (noun 2) (inf (adv 3) (verb 4))) \
				(punct 5))
			(top (punct 5) (du (smain (noun 0) (verb 1) (ppart (np (det 2) \
				(noun 3)) (verb 4))) (conj (sv1 (conj (noun 6) (vg 7) (np \
				(det 8) (noun 9))) (verb 10) (noun 11) (part 12)) (vg 13) \
				(sv1 (verb 14) (ti (comp 19) (inf (np (conj (det 15) (vg 16) \
				(det 17)) (noun 18)) (verb 20)))))) (punct 21))
			(top (punct 10) (punct 16) (punct 18) (smain (np (det 0) (noun 1) \
				(pp (prep 2) (np (det 3) (noun 4)))) (verb 5) (adv 6) (np \
				(noun 7) (noun 8)) (part 9) (np (det 11) (noun 12) (pp \
				(prep 13) (np (det 14) (noun 15)))) (conj (vg 20) (ppres \
				(adj 17) (pp (prep 22) (np (det 23) (adj 24) (noun 25)))) \
				(ppres (adj 19)) (ppres (adj 21)))) (punct 26))
			(top (punct 10) (punct 11) (punct 16) (smain (np (det 0) \
				(noun 1)) (verb 2) (np (det 3) (noun 4)) (adv 5) (du (cp \
				(comp 6) (ssub (noun 7) (verb 8) (inf (verb 9)))) (du \
				(smain (noun 12) (verb 13) (adv 14) (part 15)) (noun 17)))) \
				(punct 18) (punct 19))
			(top (smain (noun 0) (verb 1) (inf (verb 8) (inf (verb 9) (inf \
				(adv 2) (pp (prep 3) (noun 4)) (pp (prep 5) (np (det 6) \
				(noun 7))) (verb 10))))) (punct 11))
			(top (smain (noun 0) (verb 1) (pp (prep 2) (np (det 3) (adj 4) \
				(noun 5) (rel (noun 6) (ssub (noun 7) (verb 10) (ppart \
				(adj 8) (part 9) (verb 11))))))) (punct 12))
			(top (smain (np (det 0) (noun 1)) (verb 2) (ap (adv 3) (num 4) \
				(cp (comp 5) (np (det 6) (adj 7) (noun 8) (rel (noun 9) (ssub \
				(noun 10) (verb 11) (pp (prep 12) (np (det 13) (adj 14) \
				(adj 15) (noun 16))))))))) (punct 17))
			(top (smain (np (det 0) (noun 1)) (verb 2) (adv 3) (pp (prep 4) \
				(np (det 5) (noun 6)) (part 7))) (punct 8))
			(top (punct 7) (conj (smain (noun 0) (verb 1) (np (det 2) \
				(noun 3)) (pp (prep 4) (np (det 5) (noun 6)))) (smain \
				(verb 8) (np (det 9) (num 10) (noun 11)) (part 12)) (vg 13) \
				(smain (verb 14) (noun 15) (pp (prep 16) (np (det 17) \
				(noun 18) (pp (prep 19) (np (det 20) (noun 21))))))) \
				(punct 22))
			(top (smain (np (det 0) (noun 1) (rel (noun 2) (ssub (np (num 3) \
				(noun 4)) (adj 5) (verb 6)))) (verb 7) (ppart (verb 8) (pp \
				(prep 9) (noun 10)))) (punct 11))
			(top (conj (sv1 (np (det 0) (noun 1)) (verb 2) (ppart (verb 3))) \
				(vg 4) (sv1 (verb 5) (pp (prep 6) (np (det 7) (adj 8) \
				(noun 9))))) (punct 10))
			(top (smain (noun 0) (verb 1) (np (det 2) (noun 3)) (inf (adj 4) \
				(verb 5) (cp (comp 6) (ssub (noun 7) (adv 8) (verb 10) (ap \
				(num 9) (cp (comp 11) (np (det 12) (adj 13) (noun 14) (pp \
				(prep 15) (conj (np (det 16) (noun 17)) (vg 18) (np \
				(noun 19))))))))))) (punct 20))
			(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) \
				(inf (verb 6) (conj (inf (pp (prep 2) (np (det 3) (noun 4))) \
				(verb 7)) (inf (verb 9)) (vg 10) (inf (verb 11)))))) \
				(punct 12))
			(top (smain (verb 2) (noun 3) (adv 4) (ppart (np (det 0) \
				(noun 1)) (verb 5))) (punct 6))
			(top (conj (smain (np (det 0) (noun 1)) (verb 2) (adj 3) (pp \
				(prep 4) (np (det 5) (noun 6)))) (vg 7) (smain (np (det 8) \
				(noun 9) (pp (prep 10) (np (det 11) (noun 12)))) (verb 13) \
				(pp (prep 14) (np (det 15) (noun 16))))) (punct 17))
			(top (conj (smain (noun 0) (verb 1) (inf (ppart (np (noun 2) \
				(noun 3)) (verb 4)) (verb 5))) (vg 6) (smain (noun 7) \
				(inf (ppart (np (det 8) (noun 9)))))) (punct 10))
			(A (B1 (t 6) (t 13)) (B2 (t 3) (t 7) (t 10))  (B3 (t 1) \
				(t 9) (t 11) (t 14) (t 16)) (B4 (t 0) (t 5) (t 8)))
			(A (B1 6 13) (B2 3 7 10)  (B3 1 \
				9 11 14 16) (B4 0 5 8))
			(VP (VB 0) (PRT 2))
			(VP (VP 0 3) (NP (PRP 1) (NN 2)))
			(ROOT (S (VP_2 (PP (APPR 0) (ART 1) (NN 2) (PP (APPR 3) (ART 4) \
				(ADJA 5) (NN 6))) (ADJD 10) (PP (APPR 11) (NN 12)) (VVPP 13)) \
				(VAFIN 7) (NP (ART 8) (NN 9))) ($. 14))'''
    sents = '''Leider stehen diese Fragen nicht im Vordergrund der \
				augenblicklichen Diskussion .
			is Mary happy there
			das muss man jetzt machen
			Of ze had gewoon met haar vriendinnen rond kunnen slenteren in de \
				buurt van Trafalgar Square .
			Het had een prachtige dag kunnen zijn in Londen .
			Cathy zag hen wild zwaaien .
			Het was een spel geworden , zij en haar vriendinnen kozen iemand \
				uit en probeerden zijn of haar nationaliteit te raden .
			Elk jaar in het hoogseizoen trokken daar massa's toeristen \
				voorbij , hun fototoestel in de aanslag , pratend , gillend \
				en lachend in de vreemdste talen .
			Haar vader stak zijn duim omhoog alsof hij wilde zeggen : " het \
				komt wel goed , joch " .
			Ze hadden languit naast elkaar op de strandstoelen kunnen gaan \
				liggen .
			Het hoorde bij de warme zomerdag die ze ginds achter had gelaten .
			De oprijlaan was niet meer dan een hobbelige zandstrook die zich \
				voortslingerde tussen de hoge grijze boomstammen .
			Haar moeder kleefde bijna tegen het autoraampje aan .
			Ze veegde de tranen uit haar ooghoeken , tilde haar twee koffers \
				op en begaf zich in de richting van het landhuis .
			Het meisje dat vijf keer juist raadde werd getrakteerd op ijs .
			Haar neus werd platgedrukt en leek op een jonge champignon .
			Cathy zag de BMW langzaam verdwijnen tot hij niet meer was dan \
				een zilveren schijnsel tussen de bomen en struiken .
			Ze had met haar moeder kunnen gaan winkelen , zwemmen of \
				terrassen .
			Dat werkwoord had ze zelf uitgevonden .
			De middagzon hing klein tussen de takken en de schaduwen van de \
				wolken drentelden over het gras .
			Zij zou mams rug ingewreven hebben en mam de hare .
			0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
			0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
			0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
			0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
			Mit einer Messe in der Sixtinischen Kapelle ist das Konklave \
				offiziell zu Ende gegangen .'''
    from discodop.tree import DrawTree
    trees = [Tree(a) for a in trees.splitlines()]
    sents = [a.split() for a in sents.splitlines()]
    sents.extend([['Wake', None, 'up'], [None, 'your', 'friend', None]])
    for n, (tree, sent) in enumerate(zip(trees, sents)):
        drawtree = DrawTree(tree, sent)
        print('\ntree, sent',
              n,
              tree,
              ' '.join('...' if a is None else a for a in sent),
              repr(drawtree),
              sep='\n')
        try:
            print(drawtree.text(unicodelines=True, ansi=True), sep='\n')
        except (UnicodeDecodeError, UnicodeEncodeError):
            print(drawtree.text(unicodelines=False, ansi=False), sep='\n')
Exemple #24
0
def getgrammars(trees, sents, stages, testmaxwords, resultdir,
		numproc, lexmodel, simplelexsmooth, top):
	"""Read off the requested grammars."""
	tbfanout, n = treebank.treebankfanout(trees)
	logging.info('binarized treebank fan-out: %d #%d', tbfanout, n)
	for n, stage in enumerate(stages):
		if stage.split:
			traintrees = [treetransforms.binarize(
					treetransforms.splitdiscnodes(
						Tree.convert(a), stage.markorigin),
					childchar=':', dot=True, ids=grammar.UniqueIDs()).freeze()
					for a in trees]
			logging.info('splitted discontinuous nodes')
		else:
			traintrees = trees
		if stage.mode.startswith('pcfg'):
			if tbfanout != 1 and not stage.split:
				raise ValueError('Cannot extract PCFG from treebank '
						'with discontinuities.')
		backtransform = extrarules = None
		if lexmodel and simplelexsmooth:
			extrarules = lexicon.simplesmoothlexicon(lexmodel)
		if stage.dop:
			if stage.dop == 'doubledop':
				(xgrammar, backtransform, altweights, fragments
					) = grammar.doubledop(
						traintrees, sents, binarized=stage.binarized,
						iterate=stage.iterate, complement=stage.complement,
						numproc=numproc, extrarules=extrarules)
				# dump fragments
				with codecs.getwriter('utf-8')(gzip.open('%s/%s.fragments.gz' %
						(resultdir, stage.name), 'w')) as out:
					out.writelines('%s\t%d\n' % (treebank.writetree(a, b, 0,
							'bracket' if stage.mode.startswith('pcfg')
							else 'discbracket').rstrip(), sum(c.values()))
							for (a, b), c in fragments.items())
			elif stage.dop == 'reduction':
				xgrammar, altweights = grammar.dopreduction(
						traintrees, sents, packedgraph=stage.packedgraph,
						extrarules=extrarules)
			else:
				raise ValueError('unrecognized DOP model: %r' % stage.dop)
			nodes = sum(len(list(a.subtrees())) for a in traintrees)
			if lexmodel and not simplelexsmooth:  # FIXME: altweights?
				xgrammar = lexicon.smoothlexicon(xgrammar, lexmodel)
			msg = grammar.grammarinfo(xgrammar)
			rules, lex = grammar.write_lcfrs_grammar(
					xgrammar, bitpar=stage.mode.startswith('pcfg'))
			gram = Grammar(rules, lex, start=top,
					bitpar=stage.mode.startswith('pcfg'),
					binarized=stage.binarized)
			for name in altweights:
				gram.register(u'%s' % name, altweights[name])
			with gzip.open('%s/%s.rules.gz' % (
					resultdir, stage.name), 'wb') as rulesfile:
				rulesfile.write(rules)
			with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
					resultdir, stage.name), 'wb')) as lexiconfile:
				lexiconfile.write(lex)
			logging.info('DOP model based on %d sentences, %d nodes, '
				'%d nonterminals', len(traintrees), nodes, len(gram.toid))
			logging.info(msg)
			if stage.estimator != 'rfe':
				gram.switch(u'%s' % stage.estimator)
			logging.info(gram.testgrammar()[1])
			if stage.dop == 'doubledop':
				# backtransform keys are line numbers to rules file;
				# to see them together do:
				# $ paste <(zcat dop.rules.gz) <(zcat dop.backtransform.gz)
				with codecs.getwriter('ascii')(gzip.open(
						'%s/%s.backtransform.gz' % (resultdir, stage.name),
						'w')) as out:
					out.writelines('%s\n' % a for a in backtransform)
				if n and stage.prune:
					msg = gram.getmapping(stages[n - 1].grammar,
						striplabelre=None if stages[n - 1].dop
							else re.compile(b'@.+$'),
						neverblockre=re.compile(b'.+}<'),
						splitprune=stage.splitprune and stages[n - 1].split,
						markorigin=stages[n - 1].markorigin)
				else:
					# recoverfragments() relies on this mapping to identify
					# binarization nodes
					msg = gram.getmapping(None,
						striplabelre=None,
						neverblockre=re.compile(b'.+}<'),
						splitprune=False, markorigin=False)
				logging.info(msg)
			elif n and stage.prune:  # dop reduction
				msg = gram.getmapping(stages[n - 1].grammar,
					striplabelre=None if stages[n - 1].dop
						and stages[n - 1].dop != 'doubledop'
						else re.compile(b'@[-0-9]+$'),
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
				if stage.mode == 'dop-rerank':
					gram.getrulemapping(
							stages[n - 1].grammar, re.compile(br'@[-0-9]+\b'))
				logging.info(msg)
			# write prob models
			np.savez_compressed(  # pylint: disable=no-member
					'%s/%s.probs.npz' % (resultdir, stage.name),
					**{name: mod for name, mod
						in zip(gram.modelnames, gram.models)})
		else:  # not stage.dop
			xgrammar = grammar.treebankgrammar(traintrees, sents,
					extrarules=extrarules)
			logging.info('induced %s based on %d sentences',
				('PCFG' if tbfanout == 1 or stage.split else 'PLCFRS'),
				len(traintrees))
			if stage.split or os.path.exists('%s/pcdist.txt' % resultdir):
				logging.info(grammar.grammarinfo(xgrammar))
			else:
				logging.info(grammar.grammarinfo(xgrammar,
						dump='%s/pcdist.txt' % resultdir))
			if lexmodel and not simplelexsmooth:
				xgrammar = lexicon.smoothlexicon(xgrammar, lexmodel)
			rules, lex = grammar.write_lcfrs_grammar(
					xgrammar, bitpar=stage.mode.startswith('pcfg'))
			gram = Grammar(rules, lex, start=top,
					bitpar=stage.mode.startswith('pcfg'))
			with gzip.open('%s/%s.rules.gz' % (
					resultdir, stage.name), 'wb') as rulesfile:
				rulesfile.write(rules)
			with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
					resultdir, stage.name), 'wb')) as lexiconfile:
				lexiconfile.write(lex)
			logging.info(gram.testgrammar()[1])
			if n and stage.prune:
				msg = gram.getmapping(stages[n - 1].grammar,
					striplabelre=None,
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
				logging.info(msg)
		logging.info('wrote grammar to %s/%s.{rules,lex%s}.gz',
				resultdir, stage.name,
				',backtransform' if stage.dop == 'doubledop' else '')

		outside = None
		if stage.estimates in ('SX', 'SXlrgaps'):
			if stage.estimates == 'SX' and tbfanout != 1 and not stage.split:
				raise ValueError('SX estimate requires PCFG.')
			elif stage.mode != 'plcfrs':
				raise ValueError('estimates require parser w/agenda.')
			begin = time.clock()
			logging.info('computing %s estimates', stage.estimates)
			if stage.estimates == 'SX':
				outside = estimates.getpcfgestimates(gram, testmaxwords,
						gram.toid[trees[0].label])
			elif stage.estimates == 'SXlrgaps':
				outside = estimates.getestimates(gram, testmaxwords,
						gram.toid[trees[0].label])
			logging.info('estimates done. cpu time elapsed: %gs',
					time.clock() - begin)
			np.savez_compressed(  # pylint: disable=no-member
					'%s/%s.outside.npz' % (resultdir, stage.name),
					outside=outside)
			logging.info('saved %s estimates', stage.estimates)
		elif stage.estimates:
			raise ValueError('unrecognized value; specify SX or SXlrgaps.')

		stage.update(grammar=gram, backtransform=backtransform,
				outside=outside)
Exemple #25
0
def test_grammar(debug=False):
	"""Demonstrate grammar extraction."""
	from discodop.grammar import treebankgrammar, dopreduction, doubledop
	from discodop import plcfrs
	from discodop.containers import Grammar
	from discodop.treebank import NegraCorpusReader
	from discodop.treetransforms import addfanoutmarkers, removefanoutmarkers
	from discodop.disambiguation import recoverfragments
	from discodop.kbest import lazykbest
	from math import exp
	corpus = NegraCorpusReader('alpinosample.export', punct='move')
	sents = list(corpus.sents().values())
	trees = [addfanoutmarkers(binarize(a.copy(True), horzmarkov=1))
			for a in list(corpus.trees().values())[:10]]
	if debug:
		print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
		print('dop reduction')
	grammar = Grammar(dopreduction(trees[:2], sents[:2])[0],
			start=trees[0].label)
	if debug:
		print(grammar)
	_ = grammar.testgrammar()

	grammarx, backtransform, _, _ = doubledop(trees, sents,
			debug=debug, numproc=1)
	if debug:
		print('\ndouble dop grammar')
	grammar = Grammar(grammarx, start=trees[0].label)
	grammar.getmapping(grammar, striplabelre=None,
			neverblockre=re.compile(b'^#[0-9]+|.+}<'),
			splitprune=False, markorigin=False)
	if debug:
		print(grammar)
	assert grammar.testgrammar()[0], "RFE should sum to 1."
	for tree, sent in zip(corpus.trees().values(), sents):
		if debug:
			print("sentence:", ' '.join(a.encode('unicode-escape').decode()
					for a in sent))
		chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
		if debug:
			print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')
		if chart:
			mpp, parsetrees = {}, {}
			derivations, _ = lazykbest(chart, 1000, b'}<')
			for d, (t, p) in zip(chart.rankededges[chart.root()], derivations):
				r = Tree(recoverfragments(d.key, chart, backtransform))
				r = str(removefanoutmarkers(unbinarize(r)))
				mpp[r] = mpp.get(r, 0.0) + exp(-p)
				parsetrees.setdefault(r, []).append((t, p))
			if debug:
				print(len(mpp), 'parsetrees',
						sum(map(len, parsetrees.values())), 'derivations')
			for t, tp in sorted(mpp.items(), key=itemgetter(1)):
				if debug:
					print(tp, t, '\nmatch:', t == str(tree))
				if len(set(parsetrees[t])) != len(parsetrees[t]):
					print('chart:\n', chart)
					assert len(set(parsetrees[t])) == len(parsetrees[t])
				if debug:
					for deriv, p in sorted(parsetrees[t], key=itemgetter(1)):
						print(' <= %6g %s' % (exp(-p), deriv))
		elif debug:
			print('no parse\n', chart)
		if debug:
			print()
	tree = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
	Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))
Exemple #26
0
def test_allfragments():
    from discodop.fragments import recurringfragments
    model = """\
(DT the)	1
(DT The)	1
(JJ hungry)	1
(NN cat)	1
(NN dog)	1
(NP|<DT.JJ,NN> (JJ hungry) (NN ))	1
(NP|<DT.JJ,NN> (JJ hungry) (NN dog))	1
(NP|<DT.JJ,NN> (JJ ) (NN ))	1
(NP|<DT.JJ,NN> (JJ ) (NN dog))	1
(NP (DT ) (NN ))	1
(NP (DT ) (NN cat))	1
(NP (DT ) (NP|<DT.JJ,NN> ))	1
(NP (DT ) (NP|<DT.JJ,NN> (JJ hungry) (NN )))	1
(NP (DT ) (NP|<DT.JJ,NN> (JJ hungry) (NN dog)))	1
(NP (DT ) (NP|<DT.JJ,NN> (JJ ) (NN )))	1
(NP (DT ) (NP|<DT.JJ,NN> (JJ ) (NN dog)))	1
(NP (DT The) (NN ))	1
(NP (DT The) (NN cat))	1
(NP (DT the) (NP|<DT.JJ,NN> ))	1
(NP (DT the) (NP|<DT.JJ,NN> (JJ hungry) (NN )))	1
(NP (DT the) (NP|<DT.JJ,NN> (JJ hungry) (NN dog)))	1
(NP (DT the) (NP|<DT.JJ,NN> (JJ ) (NN )))	1
(NP (DT the) (NP|<DT.JJ,NN> (JJ ) (NN dog)))	1
(S (NP (DT ) (NN cat)) (VP ))	1
(S (NP (DT ) (NN cat)) (VP (VBP ) (NP )))	1
(S (NP (DT ) (NN cat)) (VP (VBP ) (NP (DT ) (NP|<DT.JJ,NN> ))))	1
(S (NP (DT ) (NN cat)) (VP (VBP saw) (NP )))	1
(S (NP (DT ) (NN cat)) (VP (VBP saw) (NP (DT ) (NP|<DT.JJ,NN> ))))	1
(S (NP (DT ) (NN )) (VP ))	1
(S (NP (DT ) (NN )) (VP (VBP ) (NP )))	1
(S (NP (DT ) (NN )) (VP (VBP ) (NP (DT ) (NP|<DT.JJ,NN> ))))	1
(S (NP (DT ) (NN )) (VP (VBP saw) (NP )))	1
(S (NP (DT ) (NN )) (VP (VBP saw) (NP (DT ) (NP|<DT.JJ,NN> ))))	1
(S (NP (DT The) (NN cat)) (VP ))	1
(S (NP (DT The) (NN cat)) (VP (VBP ) (NP )))	1
(S (NP (DT The) (NN cat)) (VP (VBP ) (NP (DT ) (NP|<DT.JJ,NN> ))))	1
(S (NP (DT The) (NN cat)) (VP (VBP saw) (NP )))	1
(S (NP (DT The) (NN cat)) (VP (VBP saw) (NP (DT ) (NP|<DT.JJ,NN> ))))	1
(S (NP (DT The) (NN )) (VP ))	1
(S (NP (DT The) (NN )) (VP (VBP ) (NP )))	1
(S (NP (DT The) (NN )) (VP (VBP ) (NP (DT ) (NP|<DT.JJ,NN> ))))	1
(S (NP (DT The) (NN )) (VP (VBP saw) (NP )))	1
(S (NP (DT The) (NN )) (VP (VBP saw) (NP (DT ) (NP|<DT.JJ,NN> ))))	1
(S (NP ) (VP ))	1
(S (NP ) (VP (VBP ) (NP )))	1
(S (NP ) (VP (VBP ) (NP (DT ) (NP|<DT.JJ,NN> ))))	1
(S (NP ) (VP (VBP saw) (NP )))	1
(S (NP ) (VP (VBP saw) (NP (DT ) (NP|<DT.JJ,NN> ))))	1
(VBP saw)	1
(VP (VBP ) (NP ))	1
(VP (VBP ) (NP (DT ) (NP|<DT.JJ,NN> )))	1
(VP (VBP ) (NP (DT ) (NP|<DT.JJ,NN> (JJ ) (NN ))))	1
(VP (VBP ) (NP (DT the) (NP|<DT.JJ,NN> )))	1
(VP (VBP ) (NP (DT the) (NP|<DT.JJ,NN> (JJ ) (NN ))))	1
(VP (VBP saw) (NP ))	1
(VP (VBP saw) (NP (DT ) (NP|<DT.JJ,NN> )))	1
(VP (VBP saw) (NP (DT ) (NP|<DT.JJ,NN> (JJ ) (NN ))))	1
(VP (VBP saw) (NP (DT the) (NP|<DT.JJ,NN> )))	1
(VP (VBP saw) (NP (DT the) (NP|<DT.JJ,NN> (JJ ) (NN ))))	1"""
    model = {
        a.split('\t')[0]: int(a.split('\t')[1])
        for a in model.splitlines()
    }
    answers = recurringfragments([
        Tree('(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) '
             '(NP|<DT.JJ,NN> (JJ 4) (NN 5)))))')
    ], [['The', 'cat', 'saw', 'the', 'hungry', 'dog']],
                                 disc=False,
                                 indices=False,
                                 maxdepth=3,
                                 maxfrontier=999)
    assert model
    assert answers
    assert answers == model
Exemple #27
0
def test():
	""" Run some tests. """
	from discodop import plcfrs
	from discodop.containers import Grammar
	from discodop.treebank import NegraCorpusReader
	from discodop.treetransforms import binarize, unbinarize, \
			addfanoutmarkers, removefanoutmarkers
	from discodop.disambiguation import recoverfragments
	from discodop.kbest import lazykbest
	from discodop.fragments import getfragments
	logging.basicConfig(level=logging.DEBUG, format='%(message)s')
	filename = "alpinosample.export"
	corpus = NegraCorpusReader('.', filename, punct='move')
	sents = list(corpus.sents().values())
	trees = [addfanoutmarkers(binarize(a.copy(True), horzmarkov=1))
			for a in list(corpus.parsed_sents().values())[:10]]

	print('plcfrs')
	lcfrs = Grammar(treebankgrammar(trees, sents), start=trees[0].label)
	print(lcfrs)

	print('dop reduction')
	grammar = Grammar(dopreduction(trees[:2], sents[:2])[0],
			start=trees[0].label)
	print(grammar)
	grammar.testgrammar()

	fragments = getfragments(trees, sents, 1)
	debug = '--debug' in sys.argv
	grammarx, backtransform, _ = doubledop(trees, fragments, debug=debug)
	print('\ndouble dop grammar')
	grammar = Grammar(grammarx, start=trees[0].label)
	grammar.getmapping(grammar, striplabelre=None,
			neverblockre=re.compile(b'^#[0-9]+|.+}<'),
			splitprune=False, markorigin=False)
	print(grammar)
	assert grammar.testgrammar(), "DOP1 should sum to 1."
	for tree, sent in zip(corpus.parsed_sents().values(), sents):
		print("sentence:", ' '.join(a.encode('unicode-escape').decode()
				for a in sent))
		chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
		print('\n', msg, end='')
		print("\ngold ", tree)
		print("double dop", end='')
		if chart:
			mpp = {}
			parsetrees = {}
			derivations, _ = lazykbest(chart, 1000, b'}<')
			for d, (t, p) in zip(chart.rankededges[chart.root()], derivations):
				r = Tree(recoverfragments(d.getkey(), chart,
					grammar, backtransform))
				r = str(removefanoutmarkers(unbinarize(r)))
				mpp[r] = mpp.get(r, 0.0) + exp(-p)
				parsetrees.setdefault(r, []).append((t, p))
			print(len(mpp), 'parsetrees', end='')
			print(sum(map(len, parsetrees.values())), 'derivations')
			for t, tp in sorted(mpp.items(), key=itemgetter(1)):
				print(tp, '\n', t, end='')
				print("match:", t == str(tree))
				assert len(set(parsetrees[t])) == len(parsetrees[t])
				if not debug:
					continue
				for deriv, p in sorted(parsetrees[t], key=itemgetter(1)):
					print(' <= %6g %s' % (exp(-p), deriv))
		else:
			print("no parse")
			print(chart)
		print()
	tree = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
	Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))
Exemple #28
0
def parse():
    """Parse sentence and return a textual representation of a parse tree.

	Output is either in a HTML fragment or in plain text. To be invoked by an
	AJAX call."""
    sent = request.args.get('sent', None)
    objfun = request.args.get('objfun', 'mpp')
    est = request.args.get('est', 'rfe')
    marg = request.args.get('marg', 'nbest')
    coarse = request.args.get('coarse', 'pcfg')
    html = 'html' in request.args
    lang = request.args.get('lang', 'detect')
    require = request.args.get('require', None)
    block = request.args.get('block', None)
    if not sent:
        return ''
    nbest = None
    if POSTAGS.match(sent):
        senttok, tags = zip(*(a.rsplit('/', 1) for a in sent.split()))
    else:
        senttok, tags = tuple(tokenize(sent)), None
    if not senttok or not 1 <= len(senttok) <= LIMIT:
        return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT)
    if lang == 'detect':
        lang = guesslang(senttok)
    elif lang not in PARSERS:
        return 'unknown language %r; languages: %r' % (lang, PARSERS.keys())
    if require:
        require = tuple((label, tuple(indices))
                        for label, indices in sorted(json.loads(require)))
    if block:
        block = tuple((label, tuple(indices))
                      for label, indices in sorted(json.loads(block)))
    key = (senttok, tags, est, marg, objfun, coarse, lang, require, block)
    resp = CACHE.get(key)
    if resp is None:
        urlparams = dict(sent=sent,
                         lang=lang,
                         est=est,
                         marg=marg,
                         objfun=objfun,
                         coarse=coarse,
                         html=html)
        if require:
            urlparams['require'] = json.dumps(require)
        if block:
            urlparams['block'] = json.dumps(block)
        link = '?' + url_encode(urlparams)
        PARSERS[lang].stages[-1].estimator = est
        PARSERS[lang].stages[-1].objective = objfun
        PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both')
        PARSERS[lang].stages[-1].sample = marg in ('sample', 'both')
        if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse:
            PARSERS[lang].stages[0].mode = ('pcfg' if coarse
                                            == 'pcfg-posterior' else coarse)
            if len(PARSERS[lang].stages) > 1:
                PARSERS[lang].stages[1].k = (1e-5 if coarse == 'pcfg-posterior'
                                             else 50)
        results = list(PARSERS[lang].parse(senttok,
                                           tags=tags,
                                           require=require,
                                           block=block))
        if SHOWMORPH:
            replacemorph(results[-1].parsetree)
        if SHOWFUNC:
            treebank.handlefunctions('add', results[-1].parsetree, pos=True)
        tree = str(results[-1].parsetree)
        prob = results[-1].prob
        parsetrees = results[-1].parsetrees or []
        parsetrees = heapq.nlargest(10, parsetrees, key=itemgetter(1))
        parsetrees_ = []
        LOG.info('[%s] %s', probstr(prob), tree)
        tree = Tree.parse(tree, parse_leaf=int)
        result = Markup(
            DrawTree(tree, senttok).text(unicodelines=True,
                                         html=html,
                                         funcsep='-'))
        for tree, prob, x in parsetrees:
            tree = PARSERS[lang].postprocess(tree, senttok, -1)[0]
            if SHOWMORPH:
                replacemorph(tree)
            if SHOWFUNC:
                treebank.handlefunctions('add', tree, pos=True)
            parsetrees_.append((tree, prob, x))
        if PARSERS[lang].headrules:
            xtree = PARSERS[lang].postprocess(parsetrees[0][0], senttok, -1)[0]
            dep = treebank.writedependencies(xtree, senttok, 'conll')
            depsvg = Markup(DrawDependencies.fromconll(dep).svg())
        else:
            dep = depsvg = ''
        rid = randid()
        nbest = Markup('\n\n'.join(
            '%d. [%s] '
            '<a href=\'javascript: toggle("f%s%d"); \'>'
            'derivation</a>\n'
            '<span id=f%s%d style="display: none; margin-left: 3em; ">'
            'Fragments used in the highest ranked derivation'
            ' of this parse tree:\n%s</span>\n%s' % (
                n + 1,
                probstr(prob),
                rid,
                n + 1,
                rid,
                n + 1,
                '\n\n'.join(
                    '%s\n%s' %
                    (w, DrawTree(frag).text(unicodelines=True, html=html))
                    for frag, w in fragments or ()  # if frag.count('(') > 1
                ),
                DrawTree(tree, senttok).text(
                    unicodelines=True, html=html, funcsep='-'))
            for n, (tree, prob, fragments) in enumerate(parsetrees_)))
        deriv = Markup(
            'Fragments used in the highest ranked derivation'
            ' of best parse tree:\n%s' % (
                '\n\n'.join(
                    '%s\n%s' %
                    (w, DrawTree(frag).text(unicodelines=True, html=html))
                    for frag, w in parsetrees_[0][2] or ()
                    # if frag.count('(') > 1
                ))) if parsetrees_ else ''
        msg = '\n'.join(stage.msg for stage in results)
        elapsed = [stage.elapsedtime for stage in results]
        elapsed = 'CPU time elapsed: %s => %gs' % (' '.join(
            '%gs' % a for a in elapsed), sum(elapsed))
        info = '\n'.join((
            'length: %d; lang=%s; est=%s; objfun=%s; marg=%s' %
            (len(senttok), lang, est, objfun, marg), msg, elapsed,
            '10 most probable parse trees:',
            ''.join('%d. [%s] %s' %
                    (n + 1, probstr(prob), writediscbrackettree(tree, senttok))
                    for n, (tree, prob, _) in enumerate(parsetrees)) + '\n'))
        CACHE.set(key, (sent, result, nbest, deriv, info, link, dep, depsvg),
                  timeout=5000)
    else:
        (sent, result, nbest, deriv, info, link, dep, depsvg) = resp
    if html:
        return render_template('parsetree.html',
                               sent=sent,
                               result=result,
                               nbest=nbest,
                               deriv=deriv,
                               info=info,
                               link=link,
                               dep=dep,
                               depsvg=depsvg,
                               randid=randid())
    else:
        return Response('\n'.join((nbest, info, result)),
                        mimetype='text/plain')
Exemple #29
0
def parse():
	"""Parse sentence and return a textual representation of a parse tree.

	Output is either in a HTML fragment or in plain text. To be invoked by an
	AJAX call."""
	sent = request.args.get('sent', None)
	est = request.args.get('est', 'rfe')
	marg = request.args.get('marg', 'nbest')
	objfun = request.args.get('objfun', 'mpp')
	coarse = request.args.get('coarse', None)
	html = 'html' in request.args
	lang = request.args.get('lang', 'detect')
	if not sent:
		return ''
	frags = nbest = None
	senttok = tokenize(sent)
	if not senttok or not 1 <= len(senttok) <= LIMIT:
		return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT)
	if lang == 'detect':
		lang = guesslang(senttok)
	elif lang not in PARSERS:
		return 'unknown language %r; languages: %r' % (lang, PARSERS.keys())
	key = (senttok, est, marg, objfun, coarse, lang)
	resp = CACHE.get(key)
	if resp is None:
		link = 'parse?' + url_encode(dict(sent=sent, est=est, marg=marg,
				objfun=objfun, coarse=coarse, html=html))
		PARSERS[lang].stages[-1].estimator = est
		PARSERS[lang].stages[-1].objective = objfun
		PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both')
		PARSERS[lang].stages[-1].sample = marg in ('sample', 'both')
		if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse:
			PARSERS[lang].stages[0].mode = coarse
			PARSERS[lang].stages[1].k = (1e-5
					if coarse == 'pcfg-posterior' else 50)

		results = list(PARSERS[lang].parse(senttok))
		if results[-1].noparse:
			parsetrees = []
			result = 'no parse!'
			frags = nbest = ''
		else:
			if SHOWMORPH:
				for node in results[-1].parsetree.subtrees(
						lambda n: n and not isinstance(n[0], Tree)):
					treebank.handlemorphology(
							'replace', None, node, node.source)
					node.label = node.label.replace('[]', '')
			if SHOWFUNC:
				treebank.handlefunctions('add', results[-1].parsetree, pos=True)
			tree = str(results[-1].parsetree)
			prob = results[-1].prob
			parsetrees = results[-1].parsetrees or []
			parsetrees = heapq.nlargest(10, parsetrees, key=itemgetter(1))
			parsetrees_ = []
			fragments = results[-1].fragments or ()
			APP.logger.info('[%s] %s', probstr(prob), tree)
			tree = Tree.parse(tree, parse_leaf=int)
			result = Markup(DrawTree(tree, senttok).text(
					unicodelines=True, html=html, funcsep='-'))
			frags = Markup('Phrasal fragments used in the most probable '
					'derivation of the highest ranked parse tree:\n'
					+ '\n\n'.join(
					DrawTree(frag).text(unicodelines=True, html=html)
					for frag in fragments if frag.count('(') > 1))
			for tree, prob, x in parsetrees:
				tree = PARSERS[lang].postprocess(tree, senttok, -1)[0]
				if SHOWMORPH:
					for node in tree.subtrees(
							lambda n: n and not isinstance(n[0], Tree)):
						treebank.handlemorphology(
								'replace', None, node, node.source)
				if SHOWFUNC:
					treebank.handlefunctions('add', tree, pos=True)
				parsetrees_.append((tree, prob, x))
			nbest = Markup('\n\n'.join('%d. [%s]\n%s' % (n + 1, probstr(prob),
						DrawTree(tree, senttok).text(
							unicodelines=True, html=html, funcsep='-'))
					for n, (tree, prob, _) in enumerate(parsetrees_)))
		msg = '\n'.join(stage.msg for stage in results)
		elapsed = [stage.elapsedtime for stage in results]
		elapsed = 'CPU time elapsed: %s => %gs' % (
				' '.join('%gs' % a for a in elapsed), sum(elapsed))
		info = '\n'.join(('length: %d; lang=%s; est=%s; objfun=%s; marg=%s' % (
				len(senttok), lang, est, objfun, marg), msg, elapsed,
				'10 most probable parse trees:',
				'\n'.join('%d. [%s] %s' % (n + 1, probstr(prob),
						writediscbrackettree(tree, senttok))
						for n, (tree, prob, _) in enumerate(parsetrees))
				+ '\n'))
		CACHE.set(key, (sent, result, frags, nbest, info, link), timeout=5000)
	else:
		(sent, result, frags, nbest,  # pylint: disable=unpacking-non-sequence
				info, link) = resp  # pylint: disable=unpacking-non-sequence
	if html:
		return render_template('parsetree.html', sent=sent, result=result,
				frags=frags, nbest=nbest, info=info, link=link,
				randid=randid())
	else:
		return Response('\n'.join((nbest, frags, info, result)),
				mimetype='text/plain')
Exemple #30
0
    def evaluate(self,
                 sentences: SupertagParseDataset,
                 mini_batch_size: int = 32,
                 num_workers: int = 1,
                 embedding_storage_mode: str = "none",
                 out_path=None,
                 only_disc: str = "both",
                 accuracy: str = "both",
                 pos_accuracy: bool = True,
                 return_loss: bool = True) -> Tuple[Result, float]:
        """ Predicts supertags, pos tags and parse trees, and reports the
            predictions scores for a set of sentences.
            :param sentences: a ``DataSet`` of sentences. For each sentence
                a gold parse tree is expected as value of the `tree` label, as
                provided by ``SupertagParseDataset``.
            :param only_disc: If set, overrides the setting `DISC_ONLY` in the
                evaluation parameter file ``self.evalparam``, i.e. only evaluates
                discontinuous constituents if True. Pass "both" to report both
                results.
            :param accuracy: either 'none', 'best', 'kbest' or 'both'.
                Determines if the accuracy is computed from the best, or k-best
                predicted tags.
            :param pos_accuracy: if set, reports acc. of predicted pos tags.
            :param return_loss: if set, nll loss wrt. gold tags is reported,
                otherwise the second component in the returned tuple is 0.
            :returns: tuple with evaluation ``Result``, where the main score
                is the f1-score (for all constituents, if only_disc == "both").
        """
        from flair.datasets import DataLoader
        from discodop.tree import ParentedTree, Tree
        from discodop.treetransforms import unbinarize, removefanoutmarkers
        from discodop.eval import Evaluator, readparam
        from timeit import default_timer
        from collections import Counter

        if self.__evalparam__ is None:
            raise Exception(
                "Need to specify evaluator parameter file before evaluating")
        if only_disc == "both":
            evaluators = {
                "F1-all": Evaluator({
                    **self.evalparam, "DISC_ONLY": False
                }),
                "F1-disc": Evaluator({
                    **self.evalparam, "DISC_ONLY": True
                })
            }
        else:
            mode = self.evalparam["DISC_ONLY"] if only_disc == "param" else (
                only_disc == "true")
            strmode = "F1-disc" if mode else "F1-all"
            evaluators = {
                strmode: Evaluator({
                    **self.evalparam, "DISC_ONLY": mode
                })
            }

        data_loader = DataLoader(sentences,
                                 batch_size=mini_batch_size,
                                 num_workers=num_workers)

        # predict supertags and parse trees
        eval_loss = 0
        start_time = default_timer()
        for batch in data_loader:
            loss = self.predict(batch,
                                embedding_storage_mode=embedding_storage_mode,
                                supertag_storage_mode=accuracy,
                                postag_storage_mode=pos_accuracy,
                                label_name='predicted',
                                return_loss=return_loss)
            eval_loss += loss if return_loss else 0
        end_time = default_timer()

        i = 0
        batches = 0
        noparses = 0
        acc_ctr = Counter()
        for batch in data_loader:
            for sentence in batch:
                for token in sentence:
                    if accuracy in ("kbest", "both") and token.get_tag("supertag").value in \
                            (l.value for l in token.get_tags_proba_dist('predicted-supertag')):
                        acc_ctr["kbest"] += 1
                    if accuracy in ("best", "both") and token.get_tag("supertag").value == \
                            token.get_tag('predicted-supertag').value:
                        acc_ctr["best"] += 1
                    if pos_accuracy and token.get_tag(
                            "pos").value == token.get_tag(
                                "predicted-pos").value:
                        acc_ctr["pos"] += 1
                acc_ctr["all"] += len(sentence)
                sent = [token.text for token in sentence]
                gold = Tree(sentence.get_labels("tree")[0].value)
                gold = ParentedTree.convert(
                    unbinarize(removefanoutmarkers(gold)))
                parse = Tree(sentence.get_labels("predicted")[0].value)
                parse = ParentedTree.convert(
                    unbinarize(removefanoutmarkers(parse)))
                if parse.label == "NOPARSE":
                    noparses += 1
                for evaluator in evaluators.values():
                    evaluator.add(i, gold.copy(deep=True), list(sent),
                                  parse.copy(deep=True), list(sent))
                i += 1
            batches += 1
        scores = {
            strmode: float_or_zero(evaluator.acc.scores()['lf'])
            for strmode, evaluator in evaluators.items()
        }
        if accuracy in ("both", "kbest"):
            scores["accuracy-kbest"] = acc_ctr["kbest"] / acc_ctr["all"]
        if accuracy in ("both", "best"):
            scores["accuracy-best"] = acc_ctr["best"] / acc_ctr["all"]
        if pos_accuracy:
            scores["accuracy-pos"] = acc_ctr["pos"] / acc_ctr["all"]
        scores["coverage"] = 1 - (noparses / i)
        scores["time"] = end_time - start_time
        return (Result(
            scores['F1-all'] if 'F1-all' in scores else scores['F1-disc'],
            "\t".join(f"{mode}" for mode in scores),
            "\t".join(f"{s}" for s in scores.values()),
            '\n\n'.join(evaluator.summary()
                        for evaluator in evaluators.values())),
                eval_loss / batches)
Exemple #31
0
def getfragments(trees, sents, numproc=1, disc=True,
		iterate=False, complement=False, indices=True, cover=True):
	"""Get recurring fragments with exact counts in a single treebank.

	:returns: a dictionary whose keys are fragments as strings, and
		indices as values. When ``disc`` is ``True``, keys are of the form
		``(frag, sent)`` where ``frag`` is a unicode string, and ``sent``
		is a list of words as unicode strings; when ``disc`` is ``False``, keys
		are of the form ``frag`` where ``frag`` is a unicode string.
	:param trees: a sequence of binarized Tree objects.
	:param numproc: number of processes to use; pass 0 to use detected # CPUs.
	:param disc: when disc=True, assume trees with discontinuous constituents.
	:param iterate, complement: see :func:`_fragments.extractfragments`"""
	if numproc == 0:
		numproc = cpu_count()
	numtrees = len(trees)
	if not numtrees:
		raise ValueError('no trees.')
	mult = 1  # 3 if numproc > 1 else 1
	fragments = {}
	trees = trees[:]
	work = workload(numtrees, mult, numproc)
	PARAMS.update(disc=disc, indices=indices, approx=False, complete=False,
			complement=complement, debug=False, adjacent=False, twoterms=False)
	initworkersimple(trees, list(sents), disc)
	if numproc == 1:
		mymap = map
		myapply = APPLY
	else:
		logging.info("work division:\n%s", "\n".join("    %s: %r" % kv
				for kv in sorted(dict(numchunks=len(work),
					numproc=numproc).items())))
		# start worker processes
		pool = Pool(processes=numproc, initializer=initworkersimple,
				initargs=(trees, list(sents), disc))
		mymap = pool.map
		myapply = pool.apply
	# collect recurring fragments
	logging.info("extracting recurring fragments")
	for a in mymap(worker, work):
		fragments.update(a)
	# add 'cover' fragments corresponding to single productions
	if cover:
		cover = myapply(coverfragworker, ())
		before = len(fragments)
		fragments.update(cover)
		logging.info("merged %d unseen cover fragments", len(fragments) - before)
	fragmentkeys = list(fragments)
	bitsets = [fragments[a] for a in fragmentkeys]
	countchunk = len(bitsets) // numproc + 1
	work = list(range(0, len(bitsets), countchunk))
	work = [(n, len(work), bitsets[a:a + countchunk])
			for n, a in enumerate(work)]
	logging.info("getting exact counts for %d fragments", len(bitsets))
	counts = []
	for a in mymap(exactcountworker, work):
		counts.extend(a)
	if numproc != 1:
		pool.close()
		pool.join()
		del pool
	if iterate:  # optionally collect fragments of fragments
		logging.info("extracting fragments of recurring fragments")
		PARAMS['complement'] = False  # needs to be turned off if it was on
		newfrags = fragments
		trees, sents = None, None
		ids = count()
		for _ in range(10):  # up to 10 iterations
			newtrees = [binarize(
					introducepreterminals(Tree.parse(tree, parse_leaf=int),
					ids=ids), childchar="}") for tree, _ in newfrags]
			newsents = [["#%d" % next(ids) if word is None else word
					for word in sent] for _, sent in newfrags]
			newfrags, newcounts = iteratefragments(
					fragments, newtrees, newsents, trees, sents, numproc)
			if len(newfrags) == 0:
				break
			if trees is None:
				trees = []
				sents = []
			trees.extend(newtrees)
			sents.extend(newsents)
			fragmentkeys.extend(newfrags)
			counts.extend(newcounts)
			fragments.update(zip(newfrags, newcounts))
	logging.info("found %d fragments", len(fragmentkeys))
	if not disc:
		return {a.decode('utf-8'): b for a, b in zip(fragmentkeys, counts)}
	return {(a.decode('utf-8'), b): c
			for (a, b), c in zip(fragmentkeys, counts)}
Exemple #32
0
def bitext():
	""" Bitext parsing with a synchronous CFG.
	Translation would require a special decoder (instead of normal kbest
	derivations where the whole sentence is given). """
	print("bitext parsing with a synchronous CFG")
	trees = [Tree.parse(a, parse_leaf=int) for a in """\
	(ROOT (S (NP (NNP (John 0) (John 7))) (VP (VB (misses 1) (manque 5))\
		(PP (IN (a` 6)) (NP (NNP (Mary 2) (Mary 4)))))) (SEP (| 3)))
	(ROOT (S (NP (NNP (Mary 0) (Mary 4))) (VP (VB (likes 1) (aimes 5))\
		(NP (DT (la 6)) (NN (pizza 2) (pizza 7))))) (SEP (| 3)))""".split('\n')]
	sents = [["0"] * len(a.leaves()) for a in trees]
	for a in trees:
		treetransforms.binarize(a)
	compiled_scfg = Grammar(treebankgrammar(trees, sents))
	print("sentences:")
	for tree in trees:
		print(' '.join(w for _, w in sorted(tree.pos())))
	print("treebank:")
	for tree in trees:
		print(tree)
	print(compiled_scfg, "\n")

	print("correct translations:")
	assert parse(compiled_scfg, ["0"] * 7,
			"John likes Mary | John aimes Mary".split())
	assert parse(compiled_scfg, ["0"] * 9,
			"John misses pizza | la pizza manque a` John".split())

	print("incorrect translations:")
	assert not parse(compiled_scfg, ["0"] * 7,
			"John likes Mary | Mary aimes John".split())
	assert not parse(compiled_scfg, ["0"] * 9,
			"John misses pizza | John manque a` la pizza".split())

	# the following SCFG is taken from:
	# http://cdec-decoder.org/index.php?title=SCFG_translation
	# the grammar has been binarized and some new non-terminals had to be
	# introduced because terminals cannot appear in binary rules.
	lexicon = ("|", "ein", "ich", "Haus", "kleines", "grosses", "sah", "fand",
		"small", "little", "big", "large", "house", "shell", "a", "I",
		"saw", "found")
	another_scfg = Grammar([
			((('DT', '_ein', '_a'), ((0, ), (1, ))), 0.5),
			((('JJ', '_kleines', '_small'), ((0, ), (1, ))), 0.1),
			((('JJ', '_kleines', '_little'), ((0, ), (1, ))), 0.9),
			((('JJ', '_grosses', '_big'), ((0, ), (1, ))), 0.8),
			((('JJ', '_grosses', '_large'), ((0, ), (1, ))), 0.2345),
			((('NN_house', '_Haus', '_house'), ((0, ), (1, ))), 1),
			((('NN_shell', '_Haus', '_shell'), ((0, ), (1, ))), 1),
			((('NP', '_ich', '_I'), ((0, ), (1, ), )), 0.6),
			((('NP', 'DT', 'NP|<JJ-NN>'), ((0, 1), (0, 1))), 0.5),
			((('NP|<JJ-NN>', 'JJ', 'NN_house'), ((0, 1), (0, 1))), 0.1),
			((('NP|<JJ-NN>', 'JJ', 'NN_shell'), ((0, 1), (0, 1))), 1.3),
			((('ROOT', 'S', '_|'), ((0, 1, 0), )), 1),
			((('S', 'NP', 'VP'), ((0, 1), (0, 1))), 0.2),
			((('VP', 'V', 'NP'), ((0, 1), (0, 1))), 0.1),
			((('V', '_sah', '_saw'), ((0, ), (1, ))), 0.4),
			((('V', '_fand', '_found'), ((0, ), (1, ))), 0.4)]
			+ [((('_%s' % word, 'Epsilon'), (word, )), 1)
					for word in lexicon])
	print(another_scfg)
	sents = [
		"ich sah ein kleines Haus | I saw a small house".split(),
		"ich sah ein kleines Haus | I saw a little house".split(),
		"ich sah ein kleines Haus | I saw a small shell".split(),
		"ich sah ein kleines Haus | I saw a little shell".split()]
	for sent in sents:
		assert parse(another_scfg, sent), sent
Exemple #33
0
config.read(argv[1])

data = SupertagParseDataset(f"{config['Corpus']['filename']}.train")

from discodop.tree import ParentedTree, Tree
from discodop.treetransforms import unbinarize, removefanoutmarkers
from discodop.eval import Evaluator, readparam
from discodop.lexgrammar import SupertagGrammar

grammar = load(open(f"{config['Corpus']['filename']}.grammar", "rb"))
i = 0
evaluator = Evaluator(readparam("proper.prm"))
for sentence in data:
    words = tuple(t.text for t in sentence)
    poss = tuple(t.get_tag("pos").value for t in sentence)
    tags = tuple(((t.get_tag("supertag").value, 0.0), ) for t in sentence)
    parses = grammar.parse(poss, tags, posmode=True)
    try:
        parse = next(parses)
    except StopIteration:
        leaves = (f"({p} {i})" for p, i in zip(poss, range(len(words))))
        parse = ParentedTree(f"(NOPARSE {' '.join(leaves)})")
    gold = ParentedTree(sentence.get_labels("tree")[0].value)
    gold = ParentedTree.convert(
        unbinarize(removefanoutmarkers(Tree.convert(gold))))
    parse = ParentedTree.convert(
        unbinarize(removefanoutmarkers(Tree.convert(parse))))
    evaluator.add(i, gold.copy(deep=True), list(words), parse.copy(deep=True),
                  list(words))
    i += 1
print(evaluator.summary())
Exemple #34
0
def getgrammars(trees, sents, stages, bintype, horzmarkov, vertmarkov, factor,
		tailmarker, revmarkov, leftmostunary, rightmostunary, pospa, markhead,
		fanout_marks_before_bin, testmaxwords, resultdir, numproc,
		lexmodel, simplelexsmooth, top, relationalrealizational):
	""" Apply binarization and read off the requested grammars. """
	# fixme: this n should correspond to sentence id
	tbfanout, n = treebankfanout(trees)
	logging.info('treebank fan-out before binarization: %d #%d\n%s\n%s',
			tbfanout, n, trees[n], ' '.join(sents[n]))
	# binarization
	begin = time.clock()
	if fanout_marks_before_bin:
		trees = [addfanoutmarkers(t) for t in trees]
	if bintype == 'binarize':
		bintype += ' %s h=%d v=%d %s' % (factor, horzmarkov, vertmarkov,
			'tailmarker' if tailmarker else '')
		for a in trees:
			binarize(a, factor=factor, tailmarker=tailmarker,
					horzmarkov=horzmarkov, vertmarkov=vertmarkov,
					leftmostunary=leftmostunary, rightmostunary=rightmostunary,
					reverse=revmarkov, pospa=pospa,
					headidx=-1 if markhead else None,
					filterfuncs=(relationalrealizational['ignorefunctions']
						+ (relationalrealizational['adjunctionlabel'], ))
						if relationalrealizational else ())
	elif bintype == 'optimal':
		trees = [Tree.convert(optimalbinarize(tree))
						for n, tree in enumerate(trees)]
	elif bintype == 'optimalhead':
		trees = [Tree.convert(optimalbinarize(tree, headdriven=True,
				h=horzmarkov, v=vertmarkov)) for n, tree in enumerate(trees)]
	trees = [addfanoutmarkers(t) for t in trees]
	logging.info('binarized %s cpu time elapsed: %gs',
						bintype, time.clock() - begin)
	logging.info('binarized treebank fan-out: %d #%d', *treebankfanout(trees))
	trees = [canonicalize(a).freeze() for a in trees]

	for n, stage in enumerate(stages):
		if stage.split:
			traintrees = [binarize(splitdiscnodes(Tree.convert(a),
					stage.markorigin), childchar=':').freeze() for a in trees]
			logging.info('splitted discontinuous nodes')
		else:
			traintrees = trees
		if stage.mode.startswith('pcfg'):
			assert tbfanout == 1 or stage.split
		backtransform = None
		if stage.dop:
			if stage.usedoubledop:
				# find recurring fragments in treebank,
				# as well as depth 1 'cover' fragments
				fragments = getfragments(traintrees, sents, numproc,
						iterate=stage.iterate, complement=stage.complement)
				xgrammar, backtransform, altweights = doubledop(
						traintrees, fragments)
			else:  # DOP reduction
				xgrammar, altweights = dopreduction(
						traintrees, sents, packedgraph=stage.packedgraph)
			nodes = sum(len(list(a.subtrees())) for a in traintrees)
			if lexmodel and simplelexsmooth:
				newrules = simplesmoothlexicon(lexmodel)
				xgrammar.extend(newrules)
				for weights in altweights.values():
					weights.extend(w for _, w in newrules)
			elif lexmodel:
				xgrammar = smoothlexicon(xgrammar, lexmodel)
			msg = grammarinfo(xgrammar)
			rules, lexicon = write_lcfrs_grammar(
					xgrammar, bitpar=stage.mode.startswith('pcfg'))
			grammar = Grammar(rules, lexicon, start=top,
					bitpar=stage.mode.startswith('pcfg'))
			for name in altweights:
				grammar.register(u'%s' % name, altweights[name])
			with gzip.open('%s/%s.rules.gz' % (
					resultdir, stage.name), 'wb') as rulesfile:
				rulesfile.write(rules)
			with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
					resultdir, stage.name), 'wb')) as lexiconfile:
				lexiconfile.write(lexicon)
			logging.info('DOP model based on %d sentences, %d nodes, '
				'%d nonterminals', len(traintrees), nodes, len(grammar.toid))
			logging.info(msg)
			if stage.estimator != 'dop1':
				grammar.switch(u'%s' % stage.estimator)
			_sumsto1 = grammar.testgrammar()
			if stage.usedoubledop:
				# backtransform keys are line numbers to rules file;
				# to see them together do:
				# $ paste <(zcat dop.rules.gz) <(zcat dop.backtransform.gz)
				with codecs.getwriter('ascii')(gzip.open(
						'%s/%s.backtransform.gz' % (resultdir, stage.name),
						'w')) as out:
					out.writelines('%s\n' % a for a in backtransform)
				if n and stage.prune:
					msg = grammar.getmapping(stages[n - 1].grammar,
						striplabelre=None if stages[n - 1].dop
							else re.compile(b'@.+$'),
						neverblockre=re.compile(b'.+}<'),
						splitprune=stage.splitprune and stages[n - 1].split,
						markorigin=stages[n - 1].markorigin)
				else:
					# recoverfragments() relies on this mapping to identify
					# binarization nodes
					msg = grammar.getmapping(None,
						striplabelre=None,
						neverblockre=re.compile(b'.+}<'),
						splitprune=False, markorigin=False)
				logging.info(msg)
			elif n and stage.prune:  # dop reduction
				msg = grammar.getmapping(stages[n - 1].grammar,
					striplabelre=None if stages[n - 1].dop
						and not stages[n - 1].usedoubledop
						else re.compile(b'@[-0-9]+$'),
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
				if stage.mode == 'dop-rerank':
					grammar.getrulemapping(stages[n - 1].grammar)
				logging.info(msg)
			# write prob models
			np.savez_compressed('%s/%s.probs.npz' % (resultdir, stage.name),
					**{name: mod for name, mod
						in zip(grammar.modelnames, grammar.models)})
		else:  # not stage.dop
			xgrammar = treebankgrammar(traintrees, sents)
			logging.info('induced %s based on %d sentences',
				('PCFG' if tbfanout == 1 or stage.split else 'PLCFRS'),
				len(traintrees))
			if stage.split or os.path.exists('%s/pcdist.txt' % resultdir):
				logging.info(grammarinfo(xgrammar))
			else:
				logging.info(grammarinfo(xgrammar,
						dump='%s/pcdist.txt' % resultdir))
			if lexmodel and simplelexsmooth:
				newrules = simplesmoothlexicon(lexmodel)
				xgrammar.extend(newrules)
			elif lexmodel:
				xgrammar = smoothlexicon(xgrammar, lexmodel)
			rules, lexicon = write_lcfrs_grammar(
					xgrammar, bitpar=stage.mode.startswith('pcfg'))
			grammar = Grammar(rules, lexicon, start=top,
					bitpar=stage.mode.startswith('pcfg'))
			with gzip.open('%s/%s.rules.gz' % (
					resultdir, stage.name), 'wb') as rulesfile:
				rulesfile.write(rules)
			with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
					resultdir, stage.name), 'wb')) as lexiconfile:
				lexiconfile.write(lexicon)
			_sumsto1 = grammar.testgrammar()
			if n and stage.prune:
				msg = grammar.getmapping(stages[n - 1].grammar,
					striplabelre=None,
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
				logging.info(msg)
		logging.info('wrote grammar to %s/%s.{rules,lex%s}.gz', resultdir,
				stage.name, ',backtransform' if stage.usedoubledop else '')

		outside = None
		if stage.getestimates == 'SX':
			assert tbfanout == 1 or stage.split, 'SX estimate requires PCFG.'
			logging.info('computing PCFG estimates')
			begin = time.clock()
			outside = getpcfgestimates(grammar, testmaxwords,
					grammar.toid[trees[0].label])
			logging.info('estimates done. cpu time elapsed: %gs',
					time.clock() - begin)
			np.savez('pcfgoutside.npz', outside=outside)
			logging.info('saved PCFG estimates')
		elif stage.useestimates == 'SX':
			assert tbfanout == 1 or stage.split, 'SX estimate requires PCFG.'
			assert stage.mode != 'pcfg', (
					'estimates require agenda-based parser.')
			outside = np.load('pcfgoutside.npz')['outside']
			logging.info('loaded PCFG estimates')
		if stage.getestimates == 'SXlrgaps':
			logging.info('computing PLCFRS estimates')
			begin = time.clock()
			outside = getestimates(grammar, testmaxwords,
					grammar.toid[trees[0].label])
			logging.info('estimates done. cpu time elapsed: %gs',
						time.clock() - begin)
			np.savez('outside.npz', outside=outside)
			logging.info('saved estimates')
		elif stage.useestimates == 'SXlrgaps':
			outside = np.load('outside.npz')['outside']
			logging.info('loaded PLCFRS estimates')
		stage.update(grammar=grammar, backtransform=backtransform,
				outside=outside)
Exemple #35
0
def getfragments(trees, sents, numproc=1, iterate=False, complement=False):
	""" Get recurring fragments with exact counts in a single treebank.

	:returns: a dictionary whose keys are fragments as strings, and
		frequencies / indices as values.
	:param trees:  a sequence of binarized Tree objects. """
	if numproc == 0:
		numproc = cpu_count()
	numtrees = len(trees)
	assert numtrees
	mult = 1  # 3 if numproc > 1 else 1
	fragments = {}
	trees = trees[:]
	work = workload(numtrees, mult, numproc)
	PARAMS.update(disc=True, indices=True, approx=False, complete=False,
			quadratic=False, complement=complement)
	if numproc == 1:
		initworkersimple(trees, list(sents))
		mymap = map
		myapply = APPLY
	else:
		logging.info("work division:\n%s", "\n".join("    %s: %r" % kv
			for kv in sorted(dict(numchunks=len(work),
				numproc=numproc).items())))
		# start worker processes
		pool = Pool(processes=numproc, initializer=initworkersimple,
			initargs=(trees, list(sents)))
		mymap = pool.map
		myapply = pool.apply
	# collect recurring fragments
	logging.info("extracting recurring fragments")
	for a in mymap(worker, work):
		fragments.update(a)
	# add 'cover' fragments corresponding to single productions
	cover = myapply(coverfragworker, ())
	before = len(fragments)
	fragments.update(cover)
	logging.info("merged %d unseen cover fragments", len(fragments) - before)
	fragmentkeys = list(fragments)
	bitsets = [fragments[a] for a in fragmentkeys]
	countchunk = len(bitsets) // numproc + 1
	work = list(range(0, len(bitsets), countchunk))
	work = [(n, len(work), bitsets[a:a + countchunk])
			for n, a in enumerate(work)]
	logging.info("getting exact counts for %d fragments", len(bitsets))
	counts = []
	for a in mymap(exactcountworker, work):
		counts.extend(a)
	if numproc != 1:
		pool.close()
		pool.join()
		del pool
	if iterate:  # optionally collect fragments of fragments
		logging.info("extracting fragments of recurring fragments")
		PARAMS['complement'] = False  # needs to be turned off if it was on
		newfrags = fragments
		trees, sents = None, None
		ids = count()
		for _ in range(10):  # up to 10 iterations
			newtrees = [binarize(
					introducepreterminals(Tree.parse(tree, parse_leaf=int),
					ids=ids), childchar="}") for tree, _ in newfrags]
			newsents = [["#%d" % next(ids) if word is None else word
					for word in sent] for _, sent in newfrags]
			newfrags, newcounts = iteratefragments(
					fragments, newtrees, newsents, trees, sents, numproc)
			if len(newfrags) == 0:
				break
			if trees is None:
				trees = []
				sents = []
			trees.extend(newtrees)
			sents.extend(newsents)
			fragmentkeys.extend(newfrags)
			counts.extend(newcounts)
			fragments.update(zip(newfrags, newcounts))
	logging.info("found %d fragments", len(fragmentkeys))
	return dict(zip(fragmentkeys, counts))
Exemple #36
0
def main():
	"""Command line interface for applying tree(bank) transforms."""
	import io
	from getopt import gnu_getopt, GetoptError
	from discodop import treebanktransforms
	actions = {'none': None, 'introducepreterminals': introducepreterminals,
			'splitdisc': None, 'mergedisc': mergediscnodes, 'transform': None,
			'unbinarize': unbinarize, 'binarize': None, 'optimalbinarize': None}
	flags = ('markorigin markheads leftunary rightunary tailmarker '
			'renumber reverse'.split())
	options = ('inputfmt= outputfmt= inputenc= outputenc= slice= ensureroot= '
			'punct= headrules= functions= morphology= lemmas= factor= '
			'markorigin= maxlen= fmt= enc= transforms=').split()
	try:
		opts, args = gnu_getopt(sys.argv[1:], 'h:v:', flags + options)
		if not 1 <= len(args) <= 3:
			raise GetoptError('error: expected 1, 2, or 3 positional arguments')
	except GetoptError as err:
		print('error: %r\n%s' % (err, USAGE), file=sys.stderr)
		sys.exit(2)
	opts, action = dict(opts), args[0]
	if action not in actions:
		print('unrecognized action: %r\navailable actions: %s' % (
				action, ', '.join(actions)), file=sys.stderr)
		sys.exit(2)
	if '--fmt' in opts:
		opts['--inputfmt'] = opts['--outputfmt'] = opts['--fmt']
	if '--enc' in opts:
		opts['--inputenc'] = opts['--outputenc'] = opts['--enc']
	if opts.get('--outputfmt', WRITERS[0]) not in WRITERS:
		print('unrecognized output format: %r\navailable formats: %s' % (
				opts.get('--outputfmt'), ' '.join(WRITERS)), file=sys.stderr)
		sys.exit(2)
	infilename = args[1] if len(args) >= 2 and args[1] != '-' else '/dev/stdin'
	outfilename = args[2] if len(args) == 3 and args[2] != '-' else '/dev/stdout'

	# open corpus
	corpus = READERS[opts.get('--inputfmt', 'export')](
			infilename,
			encoding=opts.get('--inputenc', 'utf-8'),
			headrules=opts.get('--headrules'), markheads='--markheads' in opts,
			ensureroot=opts.get('--ensureroot'), punct=opts.get('--punct'),
			functions=opts.get('--functions'),
			morphology=opts.get('--morphology'),
			lemmas=opts.get('--lemmas'))
	start, end = opts.get('--slice', ':').split(':')
	start, end = (int(start) if start else None), (int(end) if end else None)
	trees = corpus.itertrees(start, end)
	if '--maxlen' in opts:
		maxlen = int(opts['--maxlen'])
		trees = ((key, (tree, sent)) for key, (tree, sent) in trees
				if len(sent) <= maxlen)
	if '--renumber' in opts:
		trees = (('%8d' % n, treesent)
				for n, (_, treesent) in enumerate(trees, 1))

	# select transformation
	transform = actions[action]
	if action in ('binarize', 'optimalbinarize'):
		h = int(opts.get('-h', 999))
		v = int(opts.get('-v', 1))
		if action == 'binarize':
			factor = opts.get('--factor', 'right')
			transform = lambda t, _: binarize(t, factor, h, v,
					leftmostunary='--leftunary' in opts,
					rightmostunary='--rightunary' in opts,
					tailmarker='$' if '--tailmarker' in opts else '')
		elif action == 'optimalbinarize':
			headdriven = '--headrules' in opts
			transform = lambda t, _: optimalbinarize(t, '|', headdriven, h, v)
	elif action == 'splitdisc':
		transform = lambda t, _: splitdiscnodes(t, '--markorigin' in opts)
	elif action == 'unbinarize':
		transform = lambda t, _: unbinarize(Tree.convert(t))
	elif action == 'transform':
		tfs = opts['--transforms'].split(',')
		transform = lambda t, s: (treebanktransforms.reversetransform(t, tfs)
				if '--reverse' in opts
				else treebanktransforms.transform(t, s, tfs))
	if transform is not None:  # NB: transform cannot affect (no. of) terminals
		trees = ((key, (transform(tree, sent), sent)) for key, (tree, sent) in trees)

	# read, transform, & write trees
	headrules = None
	if opts.get('--outputfmt') in ('mst', 'conll'):
		if not opts.get('--headrules'):
			raise ValueError('need head rules for dependency conversion')
		headrules = treebanktransforms.readheadrules(opts.get('--headrules'))
	cnt = 0
	if opts.get('--outputfmt') == 'dact':
		import alpinocorpus
		outfile = alpinocorpus.CorpusWriter(outfilename)
		if (action == 'none' and opts.get('--inputfmt') in ('alpino', 'dact')
				and set(opts) <= {'--slice', '--inputfmt', '--outputfmt',
				'--renumber'}):
			for n, (key, block) in islice(enumerate(
					corpus.blocks().items(), 1), start, end):
				outfile.write('%8d' % n if '--renumber' in opts else key, block)
				cnt += 1
		else:
			for key, (tree, sent) in trees:
				outfile.write(str(key), writetree(tree, sent, key, 'alpino'))
				cnt += 1
	else:
		encoding = opts.get('outputenc', 'utf-8')
		outfile = io.open(outfilename, 'w', encoding=encoding)
		# copy trees verbatim when only taking slice or converting encoding
		if (action == 'none' and opts.get('--inputfmt') == opts.get(
				'--outputfmt') and set(opts) <= {'--slice', '--inputenc',
				'--outputenc', '--inputfmt', '--outputfmt'}):
			for block in islice(corpus.blocks().values(), start, end):
				outfile.write(block)
				cnt += 1
		else:
			for key, (tree, sent) in trees:
				outfile.write(writetree(tree, sent, key,
						opts.get('--outputfmt', 'export'), headrules))
				cnt += 1
	print('%sed %d trees with action %r' % ('convert' if action == 'none'
			else 'transform', cnt, action), file=sys.stderr)
Exemple #37
0
def test():
	"""Do some tests."""
	trees = '''(ROOT (S (ADV 0) (VVFIN 1) (NP (PDAT 2) (NN 3)) (PTKNEG 4) \
				(PP (APPRART 5) (NN 6) (NP (ART 7) (ADJA 8) (NN 9)))) ($. 10))
			(S (NP (NN 1) (EX 3)) (VP (VB 0) (JJ 2)))
			(S (VP (PDS 0) (ADV 3) (VVINF 4)) (PIS 2) (VMFIN 1))
			(top (du (comp 0) (smain (noun 1) (verb 2) (inf (verb 8) (inf \
				(adj 3) (pp (prep 4) (np (det 5) (noun 6))) (part 7) (verb 9) \
				(pp (prep 10) (np (det 11) (noun 12) (pp (prep 13) (mwu \
				(noun 14) (noun 15))))))))) (punct 16))
			(top (smain (noun 0) (verb 1) (inf (verb 5) (inf (np (det 2) \
				(adj 3) (noun 4)) (verb 6) (pp (prep 7) (noun 8))))) (punct 9))
			(top (smain (noun 0) (verb 1) (noun 2) (inf (adv 3) (verb 4))) \
				(punct 5))
			(top (punct 5) (du (smain (noun 0) (verb 1) (ppart (np (det 2) \
				(noun 3)) (verb 4))) (conj (sv1 (conj (noun 6) (vg 7) (np \
				(det 8) (noun 9))) (verb 10) (noun 11) (part 12)) (vg 13) \
				(sv1 (verb 14) (ti (comp 19) (inf (np (conj (det 15) (vg 16) \
				(det 17)) (noun 18)) (verb 20)))))) (punct 21))
			(top (punct 10) (punct 16) (punct 18) (smain (np (det 0) (noun 1) \
				(pp (prep 2) (np (det 3) (noun 4)))) (verb 5) (adv 6) (np \
				(noun 7) (noun 8)) (part 9) (np (det 11) (noun 12) (pp \
				(prep 13) (np (det 14) (noun 15)))) (conj (vg 20) (ppres \
				(adj 17) (pp (prep 22) (np (det 23) (adj 24) (noun 25)))) \
				(ppres (adj 19)) (ppres (adj 21)))) (punct 26))
			(top (punct 10) (punct 11) (punct 16) (smain (np (det 0) \
				(noun 1)) (verb 2) (np (det 3) (noun 4)) (adv 5) (du (cp \
				(comp 6) (ssub (noun 7) (verb 8) (inf (verb 9)))) (du \
				(smain (noun 12) (verb 13) (adv 14) (part 15)) (noun 17)))) \
				(punct 18) (punct 19))
			(top (smain (noun 0) (verb 1) (inf (verb 8) (inf (verb 9) (inf \
				(adv 2) (pp (prep 3) (noun 4)) (pp (prep 5) (np (det 6) \
				(noun 7))) (verb 10))))) (punct 11))
			(top (smain (noun 0) (verb 1) (pp (prep 2) (np (det 3) (adj 4) \
				(noun 5) (rel (noun 6) (ssub (noun 7) (verb 10) (ppart \
				(adj 8) (part 9) (verb 11))))))) (punct 12))
			(top (smain (np (det 0) (noun 1)) (verb 2) (ap (adv 3) (num 4) \
				(cp (comp 5) (np (det 6) (adj 7) (noun 8) (rel (noun 9) (ssub \
				(noun 10) (verb 11) (pp (prep 12) (np (det 13) (adj 14) \
				(adj 15) (noun 16))))))))) (punct 17))
			(top (smain (np (det 0) (noun 1)) (verb 2) (adv 3) (pp (prep 4) \
				(np (det 5) (noun 6)) (part 7))) (punct 8))
			(top (punct 7) (conj (smain (noun 0) (verb 1) (np (det 2) \
				(noun 3)) (pp (prep 4) (np (det 5) (noun 6)))) (smain \
				(verb 8) (np (det 9) (num 10) (noun 11)) (part 12)) (vg 13) \
				(smain (verb 14) (noun 15) (pp (prep 16) (np (det 17) \
				(noun 18) (pp (prep 19) (np (det 20) (noun 21))))))) \
				(punct 22))
			(top (smain (np (det 0) (noun 1) (rel (noun 2) (ssub (np (num 3) \
				(noun 4)) (adj 5) (verb 6)))) (verb 7) (ppart (verb 8) (pp \
				(prep 9) (noun 10)))) (punct 11))
			(top (conj (sv1 (np (det 0) (noun 1)) (verb 2) (ppart (verb 3))) \
				(vg 4) (sv1 (verb 5) (pp (prep 6) (np (det 7) (adj 8) \
				(noun 9))))) (punct 10))
			(top (smain (noun 0) (verb 1) (np (det 2) (noun 3)) (inf (adj 4) \
				(verb 5) (cp (comp 6) (ssub (noun 7) (adv 8) (verb 10) (ap \
				(num 9) (cp (comp 11) (np (det 12) (adj 13) (noun 14) (pp \
				(prep 15) (conj (np (det 16) (noun 17)) (vg 18) (np \
				(noun 19))))))))))) (punct 20))
			(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) \
				(inf (verb 6) (conj (inf (pp (prep 2) (np (det 3) (noun 4))) \
				(verb 7)) (inf (verb 9)) (vg 10) (inf (verb 11)))))) \
				(punct 12))
			(top (smain (verb 2) (noun 3) (adv 4) (ppart (np (det 0) \
				(noun 1)) (verb 5))) (punct 6))
			(top (conj (smain (np (det 0) (noun 1)) (verb 2) (adj 3) (pp \
				(prep 4) (np (det 5) (noun 6)))) (vg 7) (smain (np (det 8) \
				(noun 9) (pp (prep 10) (np (det 11) (noun 12)))) (verb 13) \
				(pp (prep 14) (np (det 15) (noun 16))))) (punct 17))
			(top (conj (smain (noun 0) (verb 1) (inf (ppart (np (noun 2) \
				(noun 3)) (verb 4)) (verb 5))) (vg 6) (smain (noun 7) \
				(inf (ppart (np (det 8) (noun 9)))))) (punct 10))
			(A (B1 (t 6) (t 13)) (B2 (t 3) (t 7) (t 10))  (B3 (t 1) \
				(t 9) (t 11) (t 14) (t 16)) (B4 (t 0) (t 5) (t 8)))
			(A (B1 6 13) (B2 3 7 10)  (B3 1 \
				9 11 14 16) (B4 0 5 8))
			(VP (VB 0) (PRT 2))
			(VP (VP 0 3) (NP (PRP 1) (NN 2)))
			(ROOT (S (VP_2 (PP (APPR 0) (ART 1) (NN 2) (PP (APPR 3) (ART 4) \
				(ADJA 5) (NN 6))) (ADJD 10) (PP (APPR 11) (NN 12)) (VVPP 13)) \
				(VAFIN 7) (NP (ART 8) (NN 9))) ($. 14))'''
	sents = '''Leider stehen diese Fragen nicht im Vordergrund der \
				augenblicklichen Diskussion .
			is Mary happy there
			das muss man jetzt machen
			Of ze had gewoon met haar vriendinnen rond kunnen slenteren in de \
				buurt van Trafalgar Square .
			Het had een prachtige dag kunnen zijn in Londen .
			Cathy zag hen wild zwaaien .
			Het was een spel geworden , zij en haar vriendinnen kozen iemand \
				uit en probeerden zijn of haar nationaliteit te raden .
			Elk jaar in het hoogseizoen trokken daar massa's toeristen \
				voorbij , hun fototoestel in de aanslag , pratend , gillend \
				en lachend in de vreemdste talen .
			Haar vader stak zijn duim omhoog alsof hij wilde zeggen : " het \
				komt wel goed , joch " .
			Ze hadden languit naast elkaar op de strandstoelen kunnen gaan \
				liggen .
			Het hoorde bij de warme zomerdag die ze ginds achter had gelaten .
			De oprijlaan was niet meer dan een hobbelige zandstrook die zich \
				voortslingerde tussen de hoge grijze boomstammen .
			Haar moeder kleefde bijna tegen het autoraampje aan .
			Ze veegde de tranen uit haar ooghoeken , tilde haar twee koffers \
				op en begaf zich in de richting van het landhuis .
			Het meisje dat vijf keer juist raadde werd getrakteerd op ijs .
			Haar neus werd platgedrukt en leek op een jonge champignon .
			Cathy zag de BMW langzaam verdwijnen tot hij niet meer was dan \
				een zilveren schijnsel tussen de bomen en struiken .
			Ze had met haar moeder kunnen gaan winkelen , zwemmen of \
				terrassen .
			Dat werkwoord had ze zelf uitgevonden .
			De middagzon hing klein tussen de takken en de schaduwen van de \
				wolken drentelden over het gras .
			Zij zou mams rug ingewreven hebben en mam de hare .
			0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
			0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
			0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
			0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
			Mit einer Messe in der Sixtinischen Kapelle ist das Konklave \
				offiziell zu Ende gegangen .'''
	trees = [Tree.parse(a, parse_leaf=int) for a in trees.splitlines()]
	sents = [a.split() for a in sents.splitlines()]
	sents.extend([['Wake', None, 'up'],
		[None, 'your', 'friend', None]])
	for n, (tree, sent) in enumerate(zip(trees, sents)):
		drawtree = DrawTree(tree, sent)
		print('\ntree, sent', n, tree,
				' '.join('...' if a is None else a for a in sent),
				repr(drawtree),
				sep='\n')
		try:
			print(drawtree.text(unicodelines=True, ansi=True), sep='\n')
		except (UnicodeDecodeError, UnicodeEncodeError):
			print(drawtree.text(unicodelines=False, ansi=False), sep='\n')
Exemple #38
0
def parse():
    """Parse sentence and return a textual representation of a parse tree.

	Output is either in a HTML fragment or in plain text. To be invoked by an
	AJAX call."""
    sent = request.args.get('sent', None)
    est = request.args.get('est', 'rfe')
    marg = request.args.get('marg', 'nbest')
    objfun = request.args.get('objfun', 'mpp')
    coarse = request.args.get('coarse', None)
    html = 'html' in request.args
    lang = request.args.get('lang', 'detect')
    if not sent:
        return ''
    frags = nbest = None
    senttok = tokenize(sent)
    if not senttok or not 1 <= len(senttok) <= LIMIT:
        return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT)
    if lang == 'detect':
        lang = guesslang(senttok)
    elif lang not in PARSERS:
        return 'unknown language %r; languages: %r' % (lang, PARSERS.keys())
    key = (senttok, est, marg, objfun, coarse, lang)
    resp = CACHE.get(key)
    if resp is None:
        link = 'parse?' + url_encode(
            dict(sent=sent,
                 est=est,
                 marg=marg,
                 objfun=objfun,
                 coarse=coarse,
                 html=html))
        PARSERS[lang].stages[-1].estimator = est
        PARSERS[lang].stages[-1].objective = objfun
        PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both')
        PARSERS[lang].stages[-1].sample = marg in ('sample', 'both')
        if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse:
            PARSERS[lang].stages[0].mode = coarse
            PARSERS[lang].stages[1].k = (1e-5
                                         if coarse == 'pcfg-posterior' else 50)

        results = list(PARSERS[lang].parse(senttok))
        if results[-1].noparse:
            parsetrees = []
            result = 'no parse!'
            frags = nbest = ''
        else:
            if SHOWMORPH:
                replacemorph(results[-1].parsetree)
            if SHOWFUNC:
                treebank.handlefunctions('add',
                                         results[-1].parsetree,
                                         pos=True)
            tree = str(results[-1].parsetree)
            prob = results[-1].prob
            parsetrees = results[-1].parsetrees or []
            parsetrees = heapq.nlargest(10, parsetrees, key=itemgetter(1))
            parsetrees_ = []
            fragments = results[-1].fragments or ()
            APP.logger.info('[%s] %s', probstr(prob), tree)
            tree = Tree.parse(tree, parse_leaf=int)
            result = Markup(
                DrawTree(tree, senttok).text(unicodelines=True,
                                             html=html,
                                             funcsep='-'))
            frags = Markup(
                'Phrasal fragments used in the most probable '
                'derivation of the highest ranked parse tree:\n' + '\n\n'.join(
                    DrawTree(frag).text(unicodelines=True, html=html)
                    for frag in fragments if frag.count('(') > 1))
            for tree, prob, x in parsetrees:
                tree = PARSERS[lang].postprocess(tree, senttok, -1)[0]
                if SHOWMORPH:
                    replacemorph(tree)
                if SHOWFUNC:
                    treebank.handlefunctions('add', tree, pos=True)
                parsetrees_.append((tree, prob, x))
            nbest = Markup('\n\n'.join(
                '%d. [%s]\n%s' %
                (n + 1, probstr(prob), DrawTree(tree, senttok).text(
                    unicodelines=True, html=html, funcsep='-'))
                for n, (tree, prob, _) in enumerate(parsetrees_)))
        msg = '\n'.join(stage.msg for stage in results)
        elapsed = [stage.elapsedtime for stage in results]
        elapsed = 'CPU time elapsed: %s => %gs' % (' '.join(
            '%gs' % a for a in elapsed), sum(elapsed))
        info = '\n'.join(
            ('length: %d; lang=%s; est=%s; objfun=%s; marg=%s' %
             (len(senttok), lang, est, objfun, marg), msg, elapsed,
             '10 most probable parse trees:', '\n'.join(
                 '%d. [%s] %s' %
                 (n + 1, probstr(prob), writediscbrackettree(tree, senttok))
                 for n, (tree, prob, _) in enumerate(parsetrees)) + '\n'))
        CACHE.set(key, (sent, result, frags, nbest, info, link), timeout=5000)
    else:
        (
            sent,
            result,
            frags,
            nbest,  # pylint: disable=unpacking-non-sequence
            info,
            link) = resp  # pylint: disable=unpacking-non-sequence
    if html:
        return render_template('parsetree.html',
                               sent=sent,
                               result=result,
                               frags=frags,
                               nbest=nbest,
                               info=info,
                               link=link,
                               randid=randid())
    else:
        return Response('\n'.join((nbest, frags, info, result)),
                        mimetype='text/plain')
Exemple #39
0
def test_grammar(debug=False):
    """Demonstrate grammar extraction."""
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers, removefanoutmarkers
    from discodop.disambiguation import recoverfragments
    from discodop.kbest import lazykbest
    from math import exp
    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(corpus.sents().values())
    trees = [
        addfanoutmarkers(binarize(a.copy(True), horzmarkov=1))
        for a in list(corpus.trees().values())[:10]
    ]
    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')
    grammar = Grammar(dopreduction(trees[:2], sents[:2])[0],
                      start=trees[0].label)
    if debug:
        print(grammar)
    _ = grammar.testgrammar()

    grammarx, backtransform, _, _ = doubledop(trees,
                                              sents,
                                              debug=False,
                                              numproc=1)
    if debug:
        print('\ndouble dop grammar')
    grammar = Grammar(grammarx, start=trees[0].label)
    grammar.getmapping(grammar,
                       striplabelre=None,
                       neverblockre=re.compile('^#[0-9]+|.+}<'),
                       splitprune=False,
                       markorigin=False)
    if debug:
        print(grammar)
    assert grammar.testgrammar()[0], "RFE should sum to 1."
    for tree, sent in zip(corpus.trees().values(), sents):
        if debug:
            print("sentence:",
                  ' '.join(a.encode('unicode-escape').decode() for a in sent))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        if debug:
            print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')
        if chart:
            mpp, parsetrees = {}, {}
            derivations, _ = lazykbest(chart, 1000, '}<')
            for d, (t, p) in zip(chart.rankededges[chart.root()], derivations):
                r = Tree(recoverfragments(d.key, chart, backtransform))
                r = str(removefanoutmarkers(unbinarize(r)))
                mpp[r] = mpp.get(r, 0.0) + exp(-p)
                parsetrees.setdefault(r, []).append((t, p))
            if debug:
                print(len(mpp), 'parsetrees',
                      sum(map(len, parsetrees.values())), 'derivations')
            for t, tp in sorted(mpp.items(), key=itemgetter(1)):
                if debug:
                    print(tp, t, '\nmatch:', t == str(tree))
                if len(set(parsetrees[t])) != len(parsetrees[t]):
                    print('chart:\n', chart)
                    assert len(set(parsetrees[t])) == len(parsetrees[t])
                if debug:
                    for deriv, p in sorted(parsetrees[t], key=itemgetter(1)):
                        print(' <= %6g %s' % (exp(-p), deriv))
        elif debug:
            print('no parse\n', chart)
        if debug:
            print()
    tree = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
    Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))
Exemple #40
0
def parse():
	"""Parse sentence and return a textual representation of a parse tree.

	Output is either in a HTML fragment or in plain text. To be invoked by an
	AJAX call."""
	sent = request.args.get('sent', None)
	objfun = request.args.get('objfun', 'mpp')
	est = request.args.get('est', 'rfe')
	marg = request.args.get('marg', 'nbest')
	coarse = request.args.get('coarse', 'pcfg')
	html = 'html' in request.args
	lang = request.args.get('lang', 'detect')
	require = request.args.get('require', None)
	block = request.args.get('block', None)
	if not sent:
		return ''
	nbest = None
	if POSTAGS.match(sent):
		senttok, tags = zip(*(a.rsplit('/', 1) for a in sent.split()))
	else:
		senttok, tags = tuple(tokenize(sent)), None
	if not senttok or not 1 <= len(senttok) <= LIMIT:
		return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT)
	if lang == 'detect':
		lang = guesslang(senttok)
	elif lang not in PARSERS:
		return 'unknown language %r; languages: %r' % (lang, PARSERS.keys())
	if require:
		require = tuple((label, tuple(indices))
				for label, indices in sorted(json.loads(require)))
	if block:
		block = tuple((label, tuple(indices))
				for label, indices in sorted(json.loads(block)))
	key = (senttok, tags, est, marg, objfun, coarse, lang, require, block)
	resp = CACHE.get(key)
	if resp is None:
		urlparams = dict(sent=sent, est=est, marg=marg, objfun=objfun,
				coarse=coarse, html=html)
		if require:
			urlparams['require'] = json.dumps(require)
		if block:
			urlparams['block'] = json.dumps(block)
		link = 'parse?' + url_encode(urlparams)
		PARSERS[lang].stages[-1].estimator = est
		PARSERS[lang].stages[-1].objective = objfun
		PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both')
		PARSERS[lang].stages[-1].sample = marg in ('sample', 'both')
		if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse:
			PARSERS[lang].stages[0].mode = (
					'pcfg' if coarse == 'pcfg-posterior' else coarse)
			if len(PARSERS[lang].stages) > 1:
				PARSERS[lang].stages[1].k = (1e-5
						if coarse == 'pcfg-posterior' else 50)
		results = list(PARSERS[lang].parse(
				senttok, tags=tags, require=require, block=block))
		if results[-1].noparse:
			parsetrees = []
			result = 'no parse!'
			nbest = dep = depsvg = ''
		else:
			if SHOWMORPH:
				replacemorph(results[-1].parsetree)
			if SHOWFUNC:
				treebank.handlefunctions('add', results[-1].parsetree, pos=True)
			tree = str(results[-1].parsetree)
			prob = results[-1].prob
			parsetrees = results[-1].parsetrees or []
			parsetrees = heapq.nlargest(10, parsetrees, key=itemgetter(1))
			parsetrees_ = []
			APP.logger.info('[%s] %s', probstr(prob), tree)
			tree = Tree.parse(tree, parse_leaf=int)
			result = Markup(DrawTree(tree, senttok).text(
					unicodelines=True, html=html, funcsep='-'))
			for tree, prob, x in parsetrees:
				tree = PARSERS[lang].postprocess(tree, senttok, -1)[0]
				if SHOWMORPH:
					replacemorph(tree)
				if SHOWFUNC:
					treebank.handlefunctions('add', tree, pos=True)
				parsetrees_.append((tree, prob, x))
			if PARSERS[lang].headrules:
				xtree = PARSERS[lang].postprocess(
						parsetrees[0][0], senttok, -1)[0]
				dep = treebank.writedependencies(xtree, senttok, 'conll')
				depsvg = Markup(DrawDependencies.fromconll(dep).svg())
			else:
				dep = depsvg = ''
			rid = randid()
			nbest = Markup('\n\n'.join('%d. [%s] '
					'<a href=\'javascript: toggle("f%s%d"); \'>'
					'derivation</a>\n'
					'<span id=f%s%d style="display: none; margin-left: 3em; ">'
					'Fragments used in the highest ranked derivation'
					' of this parse tree:\n%s</span>\n%s' % (
						n + 1,
						probstr(prob),
						rid, n + 1,
						rid, n + 1,
						'\n\n'.join('%s\n%s' % (w,
							DrawTree(frag).text(unicodelines=True, html=html))
							for frag, w in fragments or ()  # if frag.count('(') > 1
						),
						DrawTree(tree, senttok).text(
							unicodelines=True, html=html, funcsep='-'))
					for n, (tree, prob, fragments) in enumerate(parsetrees_)))
		msg = '\n'.join(stage.msg for stage in results)
		elapsed = [stage.elapsedtime for stage in results]
		elapsed = 'CPU time elapsed: %s => %gs' % (
				' '.join('%gs' % a for a in elapsed), sum(elapsed))
		info = '\n'.join(('length: %d; lang=%s; est=%s; objfun=%s; marg=%s' % (
				len(senttok), lang, est, objfun, marg), msg, elapsed,
				'10 most probable parse trees:',
				''.join('%d. [%s] %s' % (n + 1, probstr(prob),
						writediscbrackettree(tree, senttok))
						for n, (tree, prob, _) in enumerate(parsetrees))
				+ '\n'))
		CACHE.set(key, (sent, result, nbest, info, link, dep, depsvg),
				timeout=5000)
	else:
		(sent, result, nbest, info, link, dep, depsvg) = resp
	if html:
		return render_template('parsetree.html', sent=sent, result=result,
				nbest=nbest, info=info, link=link, dep=dep,
				depsvg=depsvg, randid=randid())
	else:
		return Response('\n'.join((nbest, info, result)),
				mimetype='text/plain')
Exemple #41
0
def parse():
	""" Parse sentence and return a textual representation of a parse tree,
	in a HTML fragment or plain text. To be invoked by an AJAX call."""
	sent = request.args.get('sent', None)
	est = request.args.get('est', 'dop1')
	marg = request.args.get('marg', 'nbest')
	objfun = request.args.get('objfun', 'mpp')
	coarse = request.args.get('coarse', None)
	html = 'html' in request.args
	lang = request.args.get('lang', 'detect')
	if not sent:
		return ''
	frags = nbest = None
	senttok = tokenize(sent)
	if not senttok or not 1 <= len(senttok) <= LIMIT:
		return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT)
	if lang == 'detect':
		lang = guesslang(senttok)
	elif lang not in PARSERS:
		return 'unknown language %r; languages: %r' % (lang, PARSERS.keys())
	key = (senttok, est, marg, objfun, coarse, lang, html)
	if CACHE.get(key) is not None:
		return CACHE.get(key)
	link = url_encode(dict(sent=sent, est=est, marg=marg, objfun=objfun,
			coarse=coarse, html=html))
	PARSERS[lang].stages[-1].estimator = est
	PARSERS[lang].stages[-1].objective = objfun
	PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both')
	PARSERS[lang].stages[-1].sample = marg in ('sample', 'both')
	if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse:
		PARSERS[lang].stages[0].mode = coarse
		PARSERS[lang].stages[1].k = 1e-5 if coarse == 'pcfg-posterior' else 50

	results = list(PARSERS[lang].parse(senttok))
	if results[-1].noparse:
		parsetrees = {}
		result = 'no parse!'
		frags = nbest = ''
	else:
		if PARSERS[lang].relationalrealizational:
			treebank.handlefunctions('add', results[-1].parsetree, pos=True)
		tree = str(results[-1].parsetree)
		prob = results[-1].prob
		parsetrees = results[-1].parsetrees or {}
		parsetrees = heapq.nlargest(10, parsetrees.items(), key=itemgetter(1))
		fragments = results[-1].fragments or ()
		APP.logger.info('[%s] %s' % (probstr(prob), tree))
		tree = Tree.parse(tree, parse_leaf=int)
		result = Markup(DrawTree(tree, senttok, abbr=True).text(
				unicodelines=True, html=html))
		frags = Markup('Phrasal fragments used in the most probable derivation'
				' of the highest ranked parse tree:\n'
				+ '\n\n'.join(
				DrawTree(Tree.parse(frag, parse_leaf=int), terminals).text(
						unicodelines=True, html=html)
				for frag, terminals in fragments))
		nbest = Markup('\n\n'.join('%d. [%s]\n%s' % (n + 1, probstr(prob),
					DrawTree(PARSERS[lang].postprocess(tree)[0], senttok,
						abbr=True).text(unicodelines=True, html=html))
				for n, (tree, prob) in enumerate(parsetrees)))
	msg = '\n'.join(stage.msg for stage in results)
	elapsed = [stage.elapsedtime for stage in results]
	elapsed = 'CPU time elapsed: %s => %gs' % (
			' '.join('%gs' % a for a in elapsed), sum(elapsed))
	info = '\n'.join(('length: %d; lang=%s; est=%s; objfun=%s; marg=%s' % (
			len(senttok), lang, est, objfun, marg), msg, elapsed,
			'10 most probable parse trees:',
			'\n'.join('%d. [%s] %s' % (n + 1, probstr(prob), tree)
					for n, (tree, prob) in enumerate(parsetrees)) + '\n'))
	if html:
		CACHE.set(key, render_template('parsetree.html', sent=sent,
				result=result, frags=frags, nbest=nbest, info=info, link=link,
				randid=randid()), timeout=5000)
	else:
		CACHE.set(key, Response('\n'.join((nbest, frags, info, result)),
				mimetype='text/plain'), timeout=5000)
	return CACHE.get(key)
Exemple #42
0
def trees(form):
	""" Return visualization of parse trees in search results. """
	# TODO: show context of x sentences around result, offer pagination.
	gotresults = False
	for n, (_textno, results, stderr) in enumerate(
			doqueries(form, lines=True)):
		if n == 0:
			# NB: we do not hide function or morphology tags when exporting
			url = 'trees?query=%s&texts=%s&engine=%s&export=1' % (
					quote(form['query']), form['texts'],
					form.get('engine', 'tgrep2'))
			yield ('Query: %s\n'
					'Trees (showing up to %d per text; '
					'export: <a href="%s">plain</a>, '
					'<a href="%s">with line numbers</a>):\n' % (
						stderr, TREELIMIT, url, url + '&linenos=1'))
		for m, line in enumerate(islice(results, TREELIMIT)):
			lineno, text, treestr, match = line.split(":::")
			if m == 0:
				gotresults = True
				yield ("==&gt; %s: [<a href=\"javascript: toggle('n%d'); \">"
						"toggle</a>]\n<span id=n%d>" % (text, n + 1, n + 1))
			if form.get('engine', 'tgrep2') == 'tgrep2':
				cnt = count()
				treestr = treestr.replace(" )", " -NONE-)")
				match = match.strip()
				if match.startswith('('):
					treestr = treestr.replace(match, '%s_HIGH %s' % tuple(
							match.split(None, 1)))
				else:
					match = ' %s)' % match
					treestr = treestr.replace(match, '_HIGH%s' % match)
				tree = Tree.parse(treestr, parse_leaf=lambda _: next(cnt))
				sent = re.findall(r" +([^ ()]+)(?=[ )])", treestr)
				high = list(tree.subtrees(lambda n: n.label.endswith("_HIGH")))
				if high:
					high = high.pop()
					high.label = high.label.rsplit("_", 1)[0]
					high = list(high.subtrees()) + high.leaves()
			elif form.get('engine', 'tgrep2') == 'xpath':
				tree, sent = treebank.alpinotree(
						ElementTree.fromstring(treestr))
						# morphology='replace')
				highwords = re.findall('<node[^>]*begin="([0-9]+)"[^>]*/>',
						match)
				high = set(re.findall(r'\bid="(.+?)"', match))
				high = list(tree.subtrees(lambda n:
						n.source[treebank.PARENT] in high or
						n.source[treebank.WORD].lstrip('#') in high))
				high += [int(a) for a in highwords]
			try:
				treerepr = DrawTree(tree, sent, highlight=high).text(
						unicodelines=True, html=True)
			except ValueError as err:
				line = "#%s \nERROR: %s\n%s\n%s\n" % (
						lineno, err, treestr, tree)
			else:
				line = "#%s\n%s\n" % (lineno, treerepr)
			yield line
		yield "</span>"
	if not gotresults:
		yield "No matches."