def test_popitem_ties(self):
    """Drain an agenda in which every key has the same priority.

    ``TestHeap.testN`` integer keys are inserted, all with priority
    ``0.``; each is then popped again.  Every popped priority must be
    ``0.`` and the agenda must pass ``check_invariants`` after each pop.
    """
    from discodop.plcfrs import Agenda
    agenda = Agenda()
    for key in range(TestHeap.testN):
        agenda[key] = 0.
    for _ in range(TestHeap.testN):
        # Only the priority is checked; which key comes out is left open.
        _key, priority = agenda.popitem()
        self.assertEqual(priority, 0.)
        self.check_invariants(agenda)
def test_init_small(self):
    """Build three-entry agendas from (key, priority) lists and drain them.

    The entries must come out in the order of a stable sort on the
    priority, so entries with equal priority are expected in the order
    in which they were given.  Afterwards the agenda must be empty.
    """
    from discodop.plcfrs import Agenda
    by_priority = itemgetter(1)
    cases = (
        [(0, 3), (1, 7), (2, 1)],
        [(0, 7), (1, 3), (2, 1)],
        [(0, 7), (1, 3), (2, 7)],  # two entries tie on priority 7
    )
    for entries in cases:
        agenda = Agenda(entries)
        # One pop per entry (three in every case).
        popped = [agenda.popitem() for _ in entries]
        self.assertEqual(popped, sorted(entries, key=by_priority))
        self.assertEqual(len(agenda), 0)
def test_init(self):
    """Initialise an agenda from a dict's items and drain it completely.

    ``make_data`` supplies the (key, priority) pairs sorted by
    descending priority together with the same pairs as a dict; the
    agenda it also returns is not needed here.  Each popped item must
    equal the last remaining pair (the one with the lowest priority),
    and the agenda must shrink in step with the dict.
    """
    from discodop.plcfrs import Agenda
    _, expected, remaining = self.make_data()
    agenda = Agenda(remaining.items())
    while expected:
        popped = agenda.popitem()
        self.assertEqual(popped, expected.pop())
        # Drop the popped key from the reference dict so that its
        # size can be compared with the agenda's size.
        remaining.pop(popped[0])
        self.assertEqual(len(agenda), len(remaining))
    self.assertEqual(len(agenda), 0)
def test_init_small(self):
    """Build three-entry agendas from (key, priority) lists and drain them.

    The entries must come out in the order of a stable sort on the
    priority, so entries with equal priority are expected in the order
    in which they were given.  Afterwards the agenda must be empty.
    """
    from discodop.plcfrs import Agenda
    by_priority = itemgetter(1)
    cases = (
        [(0, 3), (1, 7), (2, 1)],
        [(0, 7), (1, 3), (2, 1)],
        [(0, 7), (1, 3), (2, 7)],  # two entries tie on priority 7
    )
    for entries in cases:
        agenda = Agenda(entries)
        # One pop per entry (three in every case).
        popped = [agenda.popitem() for _ in entries]
        self.assertEqual(popped, sorted(entries, key=by_priority))
        self.assertEqual(len(agenda), 0)
def make_data(self):
    """Create random test data for the agenda tests.

    ``TestHeap.testN`` pairs of random floats ``(key, priority)`` are
    generated and stored both in a fresh ``Agenda`` and in a plain dict.

    :returns: a tuple ``(agenda, pairs, mapping)`` where ``pairs`` is
        the list of generated pairs sorted by descending priority, so
        that ``pairs.pop()`` yields the lowest-priority pair.
    """
    from random import random
    from discodop.plcfrs import Agenda
    pairs = []
    for _ in range(TestHeap.testN):
        key = random()
        priority = random()
        pairs.append((key, priority))
    agenda = Agenda()
    mapping = {}
    # Fill both containers in generation order, before sorting.
    for key, priority in pairs:
        agenda[key] = priority
        mapping[key] = priority
    pairs.sort(key=itemgetter(1), reverse=True)
    return agenda, pairs, mapping
def minimalbinarization(tree, score, sep='|', head=None, parentstr='', h=999):
    """Find optimal binarization according to a scoring function.

    Implementation of Gildea (2010): Optimal parsing strategies for linear
    context-free rewriting systems.

    :param tree: ImmutableTree for which the optimal binarization of its top
        production will be searched. Nodes need to have a .bitset attribute,
        as produced by ``addbitsets()``.
    :param score: a function from binarized trees to scores, where lower is
        better (the scores can be anything else which supports comparisons).
        NB: this function concatenates a 1-tuple to the returned score and
        adds up its first element (``score(a) + (0,)``, ``scorep2[0]``), so
        in practice the score has to be a tuple whose first item supports
        addition.
    :param sep: separator placed between the label of ``tree`` and the
        ``<...>`` list of covered child labels in the labels of newly
        introduced nodes; also passed on to ``factorconstituent()``.
    :param head: an optional index of the head node, specifying it enables
        head-driven binarization (which constrains the possible
        binarizations).
    :param parentstr: string appended to the label of every newly
        introduced node.
    :param h: with head-driven binarization, at most the first ``h`` covered
        children are listed in the label of a new node; without a head the
        label lists all covered children. Also passed on to
        ``factorconstituent()``.
    :returns: ``tree`` itself when it has at most two children; the result
        of ``factorconstituent()`` when ``tree`` and all of its children
        have fan-out 1; otherwise a new ImmutableTree with the label and
        bitset of ``tree`` whose children are those of the first complete
        binarization taken from the agenda.
    :raises ValueError: when the agenda runs empty before a binarization
        covering all children has been found.

    >>> tree = '(X (A 0) (B 1) (C 2) (D 3) (E 4))'
    >>> tree2 = binarize(Tree.parse(tree, parse_leaf=int))
    >>> minimalbinarization(addbitsets(tree), complexityfanout, head=2) == tree2
    True
    >>> tree = addbitsets('(A (B1 (t 6) (t 13)) (B2 (t 3) (t 7) (t 10)) '
    ...     '(B3 (t 1) (t 9) (t 11) (t 14) (t 16)) (B4 (t 0) (t 5) (t 8)))')
    >>> a = minimalbinarization(tree, complexityfanout)
    >>> b = minimalbinarization(tree, fanoutcomplexity)
    >>> print(max(map(complexityfanout, a.subtrees())))
    (14, 6)
    >>> print(max(map(complexityfanout, b.subtrees())))
    (15, 5)"""
    def newproduction(a, b):
        """Return a new 'production' (here a tree) combining a and b."""
        # The label of the new node lists the children of ``tree`` that it
        # covers. In head-driven mode nonterms holds OrderedSets of child
        # indices, truncated here to the first h; otherwise it holds integer
        # bit masks (presumably getbits() yields the indices of the set
        # bits -- defined elsewhere, TODO confirm).
        if head is not None:
            siblings = (nonterms[a] | nonterms[b])[:h]
        else:
            siblings = getbits(nonterms[a] | nonterms[b])
        newlabel = '%s%s<%s>%s' % (tree.label, sep,
                ','.join(labels[x] for x in siblings), parentstr)
        new = ImmutableTree(newlabel, [a, b])
        # the new node covers the union of the positions of its children
        new.bitset = a.bitset | b.bitset
        return new

    # nothing to binarize when there are at most two children
    if len(tree) <= 2:
        return tree
    # don't bother with optimality if this particular node is not discontinuous
    # do default right factored binarization instead
    elif fanout(tree) == 1 and all(fanout(a) == 1 for a in tree):
        return factorconstituent(tree, sep=sep, h=h)
    from discodop.plcfrs import Agenda
    # labels of the children, indexed by child position; used for new labels
    labels = [a.label for a in tree]
    # the four main datastructures:
    # the agenda is a priority queue of partial binarizations to explore
    # the first complete binarization that is dequeued is the optimal one
    # (this relies on popitem() returning the entry with the lowest score;
    # Agenda is defined elsewhere -- TODO confirm)
    agenda = Agenda()
    # the working set contains all the optimal partial binarizations
    # keys are binarizations, values are their scores
    workingset = {}
    # for each of the optimal partial binarizations, this dictionary has
    # a bitset that describes which non-terminals from the input it covers
    nonterms = {}
    # reverse lookup table for nonterms (from bitsets to binarizations)
    revnonterms = {}
    # the goal is a bitset that covers all non-terminals of the input
    goal = (1 << len(tree)) - 1
    if head is None:
        # every child starts out as a partial binarization covering only
        # itself; the extra trailing 0 is the running sum of complexities
        for n, a in enumerate(tree):
            nonterms[a] = 1 << n
            revnonterms[nonterms[a]] = a
            workingset[a] = score(a) + (0,)
            agenda[a] = workingset[a]
    else:
        # head driven binarization:
        # add all non-head nodes to the working set,
        # add all combinations of non-head nodes with head to agenda
        # caveat: Crescenzi et al. (2011) show that this problem is NP hard.
        hd = tree[head]
        # in this mode coverage is tracked with OrderedSets of child indices
        # instead of integer bit masks, so the goal is replaced accordingly
        goal = OrderedSet(range(len(tree)))
        for n, a in enumerate(tree):
            nonterms[a] = OrderedSet([n])
            revnonterms[nonterms[a]] = a
            if n != head:
                workingset[a] = score(a) + (0,)
        for n, a in enumerate(tree):
            if n == head:
                continue
            # (add initial unary here)
            p = newproduction(a, hd)
            x = score(p)
            # the running sum starts with the first item of p's own score
            agenda[p] = workingset[p] = x + (x[0],)
            nonterms[p] = nonterms[a] | nonterms[hd]
            revnonterms[nonterms[p]] = p
    while agenda:
        p, x = agenda.popitem()
        if nonterms[p] == goal:
            # (add final unary here)
            # p covers all children: give its children the original label
            # and bitset instead of p's artificial label
            p = ImmutableTree(tree.label, p[:])
            p.bitset = tree.bitset
            return p
        # iterate over a snapshot, because workingset is modified below
        for p1, y in list(workingset.items()):
            # skip entries that were removed since the snapshot was taken
            if p1 not in workingset:
                continue
            # this is inefficient. we should have a single query for all
            # items not overlapping with p
            elif nonterms[p] & nonterms[p1]:
                continue
            # if we do head-driven binarization, add one nonterminal at a time
            if head is None:
                p2 = newproduction(p, p1)
                p2nonterms = nonterms[p] | nonterms[p1]
            elif len(nonterms[p1]) == 1:
                p2 = newproduction(p1, p)
                p2nonterms = nonterms[p1] | nonterms[p]
            elif len(nonterms[p]) == 1:
                p2 = newproduction(p, p1)
                p2nonterms = nonterms[p] | nonterms[p1]
            else:
                continue
            scorep2 = score(p2)
            # important: the score is the maximum score up till now
            # (the last item of x and y is the running sum, so it is
            # left out of the comparison)
            x2 = max((scorep2, y[:-1], x[:-1]))
            # add the sum of all previous parsing complexities as last item
            x2 += (scorep2[0] + x[-1] + y[-1],)
            # if new or better:
            # should we allow item when score is equal?
            if (p2nonterms not in revnonterms
                    or workingset[revnonterms[p2nonterms]] > x2):
                if p2nonterms in revnonterms:
                    # a worse binarization with the same coverage exists;
                    # remove it everywhere before storing the better one
                    a = revnonterms[p2nonterms]
                    del nonterms[a], workingset[a]
                    if a in agenda:
                        del agenda[a]
                nonterms[p2] = p2nonterms
                revnonterms[p2nonterms] = p2
                agenda[p2] = workingset[p2] = x2
    raise ValueError('agenda exhausted without finding binarization.')