Beispiel #1
0
 def make_item(self, item1, item2, inverted):
     """Combine two items under a binary STRAIGHT or INVERTED rule.

     item1 and item2 are always given in the order in which they appear on
     the f side.  When `inverted` is true the e side lists them in the
     opposite order (item2 first); otherwise both sides share the same order.

     Returns a new PhraseHGNode labelled with the rule's lhs.  Its f span
     runs from item1.fi to item2.fj, and its e span runs from the ei of the
     item that comes first on the e side to the ej of the item that comes
     last.  The node gets a single incoming edge whose tails are item1 and
     item2.
     """
     rule = Rule()
     rule.f = [item1.nt, item2.nt]
     f_start, f_end = item1.fi, item2.fj

     # Pick the e-side ordering of the two children.
     if inverted:
         lhs, e_first, e_last, e2f = INVERTED, item2, item1, [1, 0]
     else:
         lhs, e_first, e_last, e2f = STRAIGHT, item1, item2, [0, 1]
     rule.lhs = lhs
     rule.e = [e_first.nt, e_last.nt]
     rule.e2f = e2f
     e_start, e_end = e_first.ei, e_last.ej

     # Tails are always attached in f-side order, regardless of inversion.
     edge = PhraseHGEdge(rule)
     for tail in (item1, item2):
         edge.add_tail(tail)

     node = PhraseHGNode(rule.lhs, f_start, f_end, e_start, e_end)
     node.add_incoming(edge)
     return node
Beispiel #2
0
 def final_glue(self):
     """Collect the root nodes and hang them under a single START node.

     Candidates (phrases plus glue nodes) are sorted so that root nodes end
     up last, then repeatedly popped from the end.  Each popped root is
     stored in self.top_roots when its span is exactly (0, n1, 0, n2), and in
     self.other_roots otherwise.  After each pop, every node reachable from
     that root is dropped from the candidate list, as is every non-PHRASE
     node that compares less than the root.

     The returned START node receives one unary edge per top root.  When the
     externally defined flag `glue_missing_phrases` is set, or when no top
     root exists, it also receives a single edge that joins all other roots.
     """
     # topo sort: root nodes at the end, so pop() yields them first
     # NOTE(review): `unattached` is maintained below but never read again in
     # this method - confirm whether it can be dropped.
     unattached = sorted(self.phrases)
     candidates = sorted(self.phrases + self.glue_nodes)

     self.top_roots = []
     self.other_roots = []
     while candidates:
         root = candidates.pop()
         spans_everything = (
             root.fi == 0
             and root.fj == self.n1
             and root.ei == 0
             and root.ej == self.n2
         )
         bucket = self.top_roots if spans_everything else self.other_roots
         bucket.append(root)

         hg = Hypergraph(root)
         hg.find_reachable_nodes()
         unattached = [
             node for node in unattached if id(node) not in hg.found
         ]
         # Keep only nodes this root does not already cover; non-PHRASE
         # nodes must additionally not sort below the root.
         candidates = [
             node
             for node in candidates
             if id(node) not in hg.found
             and (node.nt == PHRASE or not node < root)
         ]

     top_node = PhraseHGNode(START, 0, self.n1, 0, self.n2)

     # one unary edge for each top root
     for root in self.top_roots:
         unary = Rule()
         unary.lhs = START
         unary.f = [root.nt]
         unary.e = [root.nt]
         unary.e2f = [0]
         unary_edge = PhraseHGEdge(unary)
         unary_edge.add_tail(root)
         top_node.add_incoming(unary_edge)

     # one shared edge for all other roots
     if (glue_missing_phrases or not self.top_roots) and self.other_roots:
         glue = Rule()
         glue.lhs = START
         glue_edge = PhraseHGEdge(glue)
         for root in self.other_roots:
             glue.f.append(root.nt)
             glue.e.append(root.nt)
             glue_edge.add_tail(root)
         # monotone mapping: the i-th e symbol corresponds to the i-th f symbol
         glue.e2f = list(range(len(glue.f)))
         top_node.add_incoming(glue_edge)

     return top_node
Beispiel #3
0
    def make_rule(self, a, source_phrase, fwords):
        """Build a PHRASE rule from an initial phrase and an f-side pattern.

        `source_phrase` is a 5-tuple (x, i1, j1, i2, j2); the e side of the
        phrase is a.ewords[j1:j2].  `fwords` is a list of numbers and
        subphrases: each number is an index into the French sentence
        (a.fwords), and each subphrase is a 5-tuple shaped like
        `source_phrase` that becomes a nonterminal.

        Note by Fang: the input is an initial phrase plus a possible rule
        construction that is so far only plausible on the f side.  This
        method checks that the e sides of the subphrases fit inside the
        initial phrase and do not overlap (the check runs only when the
        externally defined flag `tight_phrases` is false).

        Returns None when
          * the pattern is a single nonterminal (trivial rule),
          * a subphrase's e span lies outside the phrase or overlaps another
            one (checked only when `tight_phrases` is false), or
          * self.require_aligned_terminal is set and no terminal of the
            pattern is aligned.
        Otherwise returns a Rule with lhs, f, e, e2f, fpos, epos and span
        set, plus word_alignments when self.keep_word_alignments is set.
        """
        _label, i1, j1, i2, j2 = source_phrase

        # omit trivial rules
        if len(fwords) == 1 and type(fwords[0]) is not int:
            return None

        f_side = fwords[:]  # work on a copy; the caller's list stays intact
        fpos = [None] * len(f_side)  # index in phrase -> index in sentence
        e_slots = a.ewords[j1:j2]
        e_len = j2 - j1
        nt_count = 0  # index of the next nonterminal
        saw_aligned_terminal = False

        for pos, symbol in enumerate(f_side):
            if type(symbol) is int:
                # terminal: replace the sentence index by the word itself
                if a.faligned[symbol]:
                    saw_aligned_terminal = True
                f_side[pos] = a.fwords[symbol]
                fpos[pos] = symbol
                continue

            # nonterminal: e span made relative to the start of the phrase
            sub_x, sub_i1, sub_j1, sub_i2, sub_j2 = symbol
            hole_start = sub_j1 - j1
            hole_end = sub_j2 - j1

            if not tight_phrases:
                # the English hole can't lie outside the phrase
                if hole_start < 0 or hole_end > e_len:
                    return None
                # ... and can't overlap a hole that was already carved out
                for j in range(hole_start, hole_end):
                    if type(e_slots[j]) is tuple or e_slots[j] is None:
                        return None

            f_side[pos] = sub_x
            fpos[pos] = (sub_i1, sub_i2)
            # The first slot of the hole carries the variable; the remaining
            # slots become None and are skipped when the e side is rebuilt.
            e_slots[hole_start] = (
                sub_x, nt_count, hole_start + j1, hole_end + j1)
            for j in range(hole_start + 1, hole_end):
                e_slots[j] = None
            nt_count += 1

        # require an aligned French word
        if self.require_aligned_terminal and not saw_aligned_terminal:
            return None

        e_side = []
        epos = []
        e2f = []
        for offset in range(e_len):
            slot = e_slots[offset]
            if slot is None:
                continue
            if type(slot) is tuple:
                # NOTE: a disabled check used to force slash categories to
                # the left edge of the English side at this point.
                variable, nt_index, e_start, e_end = slot
                e2f.append(nt_index)
                e_side.append(variable)
                epos.append((e_start, e_end))
            else:
                e_side.append(slot)
                epos.append(offset + j1)

        r = Rule()
        r.lhs = PHRASE
        r.f = f_side
        r.e = e_side
        r.e2f = e2f
        r.fpos = fpos
        r.epos = epos
        r.span = (i1, i2, j1, j2)

        if self.keep_word_alignments:
            # pair up terminals only; nonterminal positions are tuples
            links = []
            for f_idx, f_at in enumerate(fpos):
                if type(f_at) is not int:
                    continue
                for e_idx, e_at in enumerate(epos):
                    if type(e_at) is int and a.aligned[f_at][e_at]:
                        links.append((f_idx, e_idx))
            r.word_alignments = links
        return r