Example #1
	def test_popitem_ties(self):
		from discodop.plcfrs import Agenda
		h = Agenda()
		for i in range(TestHeap.testN):
			h[i] = 0.
		for i in range(TestHeap.testN):
			_, v = h.popitem()
			self.assertEqual(v, 0.)
			self.check_invariants(h)
Example #2
	def test_init_small(self):
		from operator import itemgetter
		from discodop.plcfrs import Agenda
		for data in (
				[(0, 3), (1, 7), (2, 1)],
				[(0, 7), (1, 3), (2, 1)],
				[(0, 7), (1, 3), (2, 7)]):
			h = Agenda(data)
			self.assertEqual(
					[h.popitem(), h.popitem(), h.popitem()],
					sorted(data, key=itemgetter(1)))
			self.assertEqual(len(h), 0)
Example #3
	def test_init(self):
		from discodop.plcfrs import Agenda
		h, pairs, d = self.make_data()
		h = Agenda(d.items())
		while pairs:
			v = h.popitem()
			v2 = pairs.pop()
			self.assertEqual(v, v2)
			d.pop(v[0])
		self.assertEqual(len(h), len(d))
		self.assertEqual(len(h), 0)
Example #4
    def make_data(self):
        from random import random
        from operator import itemgetter
        from discodop.plcfrs import Agenda
        pairs = [(random(), random()) for _ in range(TestHeap.testN)]
        h = Agenda()
        d = {}
        for k, v in pairs:
            h[k] = v
            d[k] = v

        pairs.sort(key=itemgetter(1), reverse=True)
        return h, pairs, d
Example #5
def minimalbinarization(tree, score, sep='|', head=None, parentstr='', h=999):
	"""Find optimal binarization according to a scoring function.

	Implementation of Gildea (2010): Optimal parsing strategies for linear
	context-free rewriting systems.

	:param tree: ImmutableTree for which the optimal binarization of its top
		production will be searched. Nodes need to have a .bitset attribute,
		as produced by ``addbitsets()``.
	:param score: a function from binarized trees to scores, where lower is
		better (the scores can be of any type that supports comparison).
	:param head: an optional index of the head node; specifying it enables
		head-driven binarization (which constrains the possible binarizations).

	>>> tree = '(X (A 0) (B 1) (C 2) (D 3) (E 4))'
	>>> tree2 = binarize(Tree.parse(tree, parse_leaf=int))
	>>> minimalbinarization(addbitsets(tree), complexityfanout, head=2) == tree2
	True
	>>> tree = addbitsets('(A (B1 (t 6) (t 13)) (B2 (t 3) (t 7) (t 10)) '
	... '(B3 (t 1) (t 9) (t 11) (t 14) (t 16)) (B4 (t 0) (t 5) (t 8)))')
	>>> a = minimalbinarization(tree, complexityfanout)
	>>> b = minimalbinarization(tree, fanoutcomplexity)
	>>> print(max(map(complexityfanout, a.subtrees())))
	(14, 6)
	>>> print(max(map(complexityfanout, b.subtrees())))
	(15, 5)"""
	def newproduction(a, b):
		"""Return a new 'production' (here a tree) combining a and b."""
		if head is not None:
			siblings = (nonterms[a] | nonterms[b])[:h]
		else:
			siblings = getbits(nonterms[a] | nonterms[b])
		newlabel = '%s%s<%s>%s' % (tree.label, sep,
				','.join(labels[x] for x in siblings), parentstr)
		new = ImmutableTree(newlabel, [a, b])
		new.bitset = a.bitset | b.bitset
		return new
	if len(tree) <= 2:
		return tree
	# don't bother with optimality if this particular node is not discontinuous
	# do default right factored binarization instead
	elif fanout(tree) == 1 and all(fanout(a) == 1 for a in tree):
		return factorconstituent(tree, sep=sep, h=h)
	from discodop.plcfrs import Agenda
	labels = [a.label for a in tree]
	# the four main datastructures:
	# the agenda is a priority queue of partial binarizations to explore
	# the first complete binarization that is dequeued is the optimal one
	agenda = Agenda()
	# the working set contains all the optimal partial binarizations
	# keys are binarizations, values are their scores
	workingset = {}
	# for each of the optimal partial binarizations, this dictionary has
	# a bitset that describes which non-terminals from the input it covers
	nonterms = {}
	# reverse lookup table for nonterms (from bitsets to binarizations)
	revnonterms = {}
	# the goal is a bitset that covers all non-terminals of the input
	goal = (1 << len(tree)) - 1
	if head is None:
		for n, a in enumerate(tree):
			nonterms[a] = 1 << n
			revnonterms[nonterms[a]] = a
			workingset[a] = score(a) + (0,)
			agenda[a] = workingset[a]
	else:
		# head driven binarization:
		# add all non-head nodes to the working set,
		# add all combinations of non-head nodes with head to agenda
		# caveat: Crescenzi et al. (2011) show that this problem is NP-hard.
		hd = tree[head]
		goal = OrderedSet(range(len(tree)))
		for n, a in enumerate(tree):
			nonterms[a] = OrderedSet([n])
			revnonterms[nonterms[a]] = a
			if n != head:
				workingset[a] = score(a) + (0,)
		for n, a in enumerate(tree):
			if n == head:
				continue
			# (add initial unary here)
			p = newproduction(a, hd)
			x = score(p)
			agenda[p] = workingset[p] = x + (x[0],)
			nonterms[p] = nonterms[a] | nonterms[hd]
			revnonterms[nonterms[p]] = p
	while agenda:
		p, x = agenda.popitem()
		if nonterms[p] == goal:
			# (add final unary here)
			p = ImmutableTree(tree.label, p[:])
			p.bitset = tree.bitset
			return p
		for p1, y in list(workingset.items()):
			if p1 not in workingset:
				continue
			# this is inefficient. we should have a single query for all
			# items not overlapping with p
			elif nonterms[p] & nonterms[p1]:
				continue
			# if we do head-driven binarization, add one nonterminal at a time
			if head is None:
				p2 = newproduction(p, p1)
				p2nonterms = nonterms[p] | nonterms[p1]
			elif len(nonterms[p1]) == 1:
				p2 = newproduction(p1, p)
				p2nonterms = nonterms[p1] | nonterms[p]
			elif len(nonterms[p]) == 1:
				p2 = newproduction(p, p1)
				p2nonterms = nonterms[p] | nonterms[p1]
			else:
				continue
			scorep2 = score(p2)
			# important: the score is the maximum score up till now
			x2 = max((scorep2, y[:-1], x[:-1]))
			# add the sum of all previous parsing complexities as last item
			x2 += (scorep2[0] + x[-1] + y[-1],)
			# if new or better:
			# should we allow item when score is equal?
			if (p2nonterms not in revnonterms
				or workingset[revnonterms[p2nonterms]] > x2):
				if p2nonterms in revnonterms:
					a = revnonterms[p2nonterms]
					del nonterms[a], workingset[a]
					if a in agenda:
						del agenda[a]
				nonterms[p2] = p2nonterms
				revnonterms[p2nonterms] = p2
				agenda[p2] = workingset[p2] = x2
	raise ValueError('agenda exhausted without finding binarization.')
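Taken together, the examples above use Agenda from discodop.plcfrs as a priority queue keyed on values: entries are added from (key, value) pairs or by item assignment, and popitem() returns the entry with the lowest value first. The following minimal sketch exercises only those operations; the keys and values are made up for illustration, and it assumes disco-dop is installed.

from operator import itemgetter
from discodop.plcfrs import Agenda

# Illustrative keys/values only; the tests above use ints, floats and tuples.
data = [('a', 3.0), ('b', 7.0), ('c', 1.0)]
agenda = Agenda(data)       # initialise from (key, value) pairs
agenda['d'] = 0.5           # insert another entry via item assignment
assert 'd' in agenda and len(agenda) == 4
# popitem() yields entries ordered by value, lowest first, mirroring the
# sorted(..., key=itemgetter(1)) comparisons in the tests above.
popped = [agenda.popitem() for _ in range(len(agenda))]
assert popped == sorted(data + [('d', 0.5)], key=itemgetter(1))
assert len(agenda) == 0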