def make_item(self, item1, item2, inverted):
    """Combine two items into a new binary node (straight or inverted).

    ``item1`` and ``item2`` must always be passed in the order in which
    they appear on the f side.

    Args:
        item1: the item that comes first on the f side.
        item2: the item that comes second on the f side.
        inverted: if true, the two items swap order on the e side.

    Returns:
        A new ``PhraseHGNode`` labelled ``STRAIGHT`` or ``INVERTED`` with a
        single incoming edge whose tails are ``item1`` and ``item2``.
    """
    rule = Rule()
    # The f side is always in the given order; only the e side may flip.
    rule.f = [item1.nt, item2.nt]

    if not inverted:
        rule.lhs = STRAIGHT
        rule.e = [item1.nt, item2.nt]
        rule.e2f = [0, 1]
        e_first, e_last = item1, item2
    else:
        rule.lhs = INVERTED
        rule.e = [item2.nt, item1.nt]
        rule.e2f = [1, 0]
        e_first, e_last = item2, item1

    edge = PhraseHGEdge(rule)
    for tail in (item1, item2):
        edge.add_tail(tail)

    # The f span runs from the start of item1 to the end of item2; the e
    # span runs from the start of whichever item is first on the e side to
    # the end of the other one.
    node = PhraseHGNode(rule.lhs, item1.fi, item2.fj, e_first.ei, e_last.ej)
    node.add_incoming(edge)
    return node
def final_glue(self):
    """Collect the roots of the phrase forest and attach them to a START node.

    The candidates are ``self.phrases`` plus ``self.glue_nodes``. They are
    sorted and consumed from the end of the sorted list; each popped node
    becomes a root, and every node reachable from it is dropped from the
    remaining candidates.

    A root whose spans are exactly ``(0, self.n1)`` on the f side and
    ``(0, self.n2)`` on the e side is stored in ``self.top_roots``; any
    other root is stored in ``self.other_roots``.
    NOTE(review): n1/n2 are presumably the f/e sentence lengths -- confirm.

    Side effects:
        Resets and repopulates ``self.top_roots`` and ``self.other_roots``.
        Reads the module-level flag ``glue_missing_phrases``.

    Returns:
        A new ``PhraseHGNode`` labelled ``START`` over spans
        ``(0, self.n1, 0, self.n2)``. It gets one unary incoming edge per
        top root, plus (when ``glue_missing_phrases`` is set or there is
        no top root) a single edge whose tails are all the other roots.
    """
    # Original author's note: "topo sort. root node at the end". The
    # ordering itself comes from the nodes' comparison operators, which
    # are defined outside this block.
    candidates = sorted(self.phrases + self.glue_nodes)

    self.top_roots = []
    self.other_roots = []
    while candidates:
        root = candidates.pop()
        spans_everything = (root.fi == 0 and root.fj == self.n1 and
                            root.ei == 0 and root.ej == self.n2)
        if spans_everything:
            self.top_roots.append(root)
        else:
            self.other_roots.append(root)

        # Anything reachable from this root is already attached, so it
        # must not become a root itself. ``hg.found`` is queried by id().
        hg = Hypergraph(root)
        hg.find_reachable_nodes()
        found = hg.found
        # Of the unreached nodes, keep PHRASE nodes unconditionally and
        # other nodes only if they do not sort below the current root.
        candidates = [
            n for n in candidates
            if id(n) not in found and (n.nt == PHRASE or not n < root)
        ]

    top_node = PhraseHGNode(START, 0, self.n1, 0, self.n2)

    # Add one unary edge for each top root.
    for root in self.top_roots:
        rule = Rule()
        rule.lhs = START
        rule.f = [root.nt]
        rule.e = [root.nt]
        rule.e2f = [0]
        edge = PhraseHGEdge(rule)
        edge.add_tail(root)
        top_node.add_incoming(edge)

    # Add a single edge that glues all the other roots together.
    if (glue_missing_phrases or not self.top_roots) and self.other_roots:
        rule = Rule()
        rule.lhs = START
        edge = PhraseHGEdge(rule)
        for root in self.other_roots:
            # NOTE(review): relies on Rule() initialising ``f`` and ``e``
            # to fresh lists -- confirm in the Rule constructor.
            rule.f.append(root.nt)
            rule.e.append(root.nt)
            edge.add_tail(root)
        # The roots appear in the same order on both sides.
        rule.e2f = list(range(len(rule.f)))
        top_node.add_incoming(edge)

    return top_node
def make_rule(self, a, source_phrase, fwords):
    """Turn an initial phrase plus an f-side pattern into a PHRASE rule.

    The pattern ``fwords`` is only known to be plausible on the f side.
    This method checks (when ``tight_phrases`` is off) that the e sides of
    the subphrases fit inside the initial phrase and do not overlap, then
    builds the rule.

    Args:
        a: alignment object; this method reads ``a.ewords``, ``a.fwords``,
            ``a.faligned`` and, if word alignments are kept, ``a.aligned``.
        source_phrase: 5-tuple ``(x, i1, j1, i2, j2)``. ``j1:j2`` is the
            slice of ``a.ewords`` covered by the phrase; ``i1``/``i2`` are
            only copied into ``span``.
        fwords: list mixing ints (indices into the French sentence, i.e.
            terminals) and subphrase 5-tuples
            ``(sub_x, sub_i1, sub_j1, sub_i2, sub_j2)`` (nonterminals).
            The list is copied, not modified.

    Returns:
        ``None`` if the rule is rejected, which happens when

        * ``fwords`` is a single non-int element (a trivial rule);
        * ``tight_phrases`` is false and a subphrase's e span lies outside
          the phrase or overlaps an earlier subphrase;
        * ``self.require_aligned_terminal`` is set and no terminal has a
          true ``a.faligned`` entry.

        Otherwise a ``Rule`` with ``lhs = PHRASE`` and these fields:

        * ``f`` / ``e``: symbols on each side;
        * ``e2f``: for each nonterminal in e-side order, its index in
          f-side order;
        * ``fpos`` / ``epos``: sentence position of each symbol (an int
          for a terminal, a ``(start, end)`` pair for a nonterminal);
        * ``span``: ``(i1, i2, j1, j2)``;
        * ``word_alignments``: only when ``self.keep_word_alignments`` is
          set; ``(f_index, e_index)`` pairs of aligned terminals, indexed
          into ``f`` / ``e``.
    """
    _, i1, j1, i2, j2 = source_phrase

    # Omit trivial rules: a lone nonterminal.
    if len(fwords) == 1 and type(fwords[0]) is not int:
        return None

    f_side = fwords[:]  # work on a copy; the caller's list stays intact
    fpos = [None] * len(f_side)  # phrase index -> sentence index
    ewords = a.ewords[j1:j2]
    elen = j2 - j1

    nt_index = 0  # running nonterminal index, in f-side order
    saw_aligned_terminal = False
    for pos, item in enumerate(fwords):
        if type(item) is int:
            # Terminal symbol: replace the index by the actual word.
            if a.faligned[item]:
                saw_aligned_terminal = True
            f_side[pos] = a.fwords[item]
            fpos[pos] = item
            continue

        # Nonterminal symbol: make its e span relative to the phrase.
        sub_x, sub_i1, sub_j1, sub_i2, sub_j2 = item
        lo = sub_j1 - j1
        hi = sub_j2 - j1

        if not tight_phrases:
            # The English hole can't lie outside the phrase ...
            if lo < 0 or hi > elen:
                return None
            # ... and can't overlap a hole that was already carved out.
            if any(type(ewords[j]) is tuple or ewords[j] is None
                   for j in range(lo, hi)):
                return None

        f_side[pos] = sub_x
        fpos[pos] = (sub_i1, sub_i2)
        # The first e word of the hole carries the variable (with its span
        # converted back to sentence coordinates); the rest of the hole is
        # blanked out with None and skipped further down.
        ewords[lo] = (sub_x, nt_index, lo + j1, hi + j1)
        for j in range(lo + 1, hi):
            ewords[j] = None
        nt_index += 1

    # Require an aligned French word.
    if self.require_aligned_terminal and not saw_aligned_terminal:
        return None

    # Build the e side, dropping the None placeholders left by the holes.
    epos = []
    e_side = []
    e2f = []
    for offset in range(elen):
        eword = ewords[offset]
        if eword is None:
            continue
        if type(eword) is tuple:
            var, f_index, ei, ej = eword
            e2f.append(f_index)
            e_side.append(var)
            epos.append((ei, ej))
        else:
            e_side.append(eword)
            epos.append(offset + j1)

    r = Rule()
    r.lhs = PHRASE
    r.f = f_side
    r.e = e_side
    r.e2f = e2f
    r.fpos = fpos
    r.epos = epos
    r.span = (i1, i2, j1, j2)

    if self.keep_word_alignments:
        # Only terminals have int positions; nonterminals carry tuples.
        alignments = []
        for fi, f_at in enumerate(fpos):
            if type(f_at) is not int:
                continue
            for ei, e_at in enumerate(epos):
                if type(e_at) is int and a.aligned[f_at][e_at]:
                    alignments.append((fi, ei))
        r.word_alignments = alignments

    return r