def refineML_TopLevelMatch(ml):
    if PRINT_DEBUG:
        for m in ml:
            print "Phrase match\t%s\t-> %s\t\t(%s)\t-> (%s)" % (
                m[0].tag(), m[1].tag(), nodeText(m[0]), nodeText(
                    m[1])), topLevelPhrasesMatch(m)
    return [m for m in ml if topLevelPhrasesMatch(m)]
def linkIdenticalWords(n1, n2, matchList):
    """
    Matches nodes in the two trees where the words match, even if the node
    labels do not.  Using wordsMatch which includes stemming.

    Every node obtained by iterating n1 is compared with every node
    obtained by iterating n2.  Pairs that are already in matchList are
    never reported again.

    @param n1 Node of the original sentence; must not be a leaf
    @param n2 Node of the highlight sentence; must not be a leaf
    @param matchList Pairs of nodes that have already been matched
    @return List of (node from n1, node from n2) pairs whose words match
            and which are not yet in matchList; empty if there are none
    @raise AssertionError if n1 or n2 is a leaf
    """
    assert not n1.isLeaf(), "Original sentence node is a single leaf"
    assert not n2.isLeaf(), "Highlight sentence node is a single leaf"

    # Extract the text of every node once, up front, so that the nested
    # loop below does not recompute it for each pairing.
    ch1List = [(ch, nodeText(ch)) for ch in n1]
    ch2List = [(ch, nodeText(ch)) for ch in n2]

    # do loop explicitly to catch recursion errors and carry on
    matching = []
    for (ch1, txt1) in ch1List:
        for (ch2, txt2) in ch2List:
            try:
                if (ch1, ch2) not in matchList and wordsMatch(
                        txt1, txt2, ch1.tag(), ch2.tag()):
                    matching.append((ch1, ch2))
            except RuntimeError:
                # recursion problem in matching trees: deliberately
                # best-effort, so give up on this pair only
                continue
    return matching
def linkParentNodes(matchList, matchListNoStopList, maxDepth=1):
    """
    Considers pairs of matched nodes in the match list.
    If two nodes have the same parent, then this parent
    is matched to the ancestor of the corresponding
    highlight nodes.
    
    A maxDepth=1 allows siblings to be matched
    """
    matchingParentNodes = []
    if PRINT_DEBUG:
        print "\nInvestigating node distances"

        assert len(matchList) > 0, "No matches to work with"
        srcroot = matchList[0][0].root()
        hiroot = matchList[0][1].root()
        print "Roots: ", id(srcroot), "---", id(hiroot)
        for ms, mh in matchList:
            print id(ms.root()), "---", id(mh.root())
            if not id(ms.root()) == id(srcroot): print "src!!!"
            if not id(mh.root()) == id(hiroot):
                print "hi!!!"
                print "Original tree:"
                print nodeText(hiroot)
                print hiroot
                print "New tree involving:", mh
                print nodeText(mh.root())
                print mh.root()

    for (s1, h1) in matchList:  # was matchListNoStopList
        # for (s2, h2) in matchListNoStopList[i+1:]:
        for (s2, h2) in matchListNoStopList:
            d = nodeDistance(s1, s2)
            if PRINT_DEBUG:
                print "nodeDistance: ", d, "\t\t", s1.treeposition(
                ), "---", s2.treeposition(
                ), "\t; highlights\t", h1.treeposition(
                ), "---", h2.treeposition(),
            if d <= maxDepth:
                ca1 = s1.root()[commonAncestor(s1, s2)]
                ca2 = h1.root()[commonAncestor(h1, h2)]
                if (ca1, ca2) in matchList: continue
                if (ca1, ca2) in matchingParentNodes: continue
                if PRINT_DEBUG:
                    print
                    print "Original:  ", nodeText(s1), " --- ", nodeText(
                        s2), "\tdistance ", d
                    print "Highlight: ", nodeText(h1), " --- ", nodeText(
                        h2), nodeDistance(h1, h2)
                    print "Ancestors in match list? ", (ca1, ca2) in matchList
                    print "Original parent phrase: ", nodeText(ca1)
                    print "Highlight parent phrase:", nodeText(ca2)

                matchingParentNodes.append((ca1, ca2))
    return matchingParentNodes
def _refineML_RemoveMissedProperNounsTest(m):
    """
    Return True if the match conveys any NP present in the source tree
    """
    if m[0].isLeaf() or m[1].isLeaf(): return True
    # looking for S where NP isn't included
    # if m[0].tag()=="S" and m[1].tag():
    srcTags = [ch.tag() for ch in m[0]]
    tgtTags = [ch.tag() for ch in m[1]]

    if "NP" not in srcTags and "NP" in tgtTags:
        # TODO: check that NP actually contains proper nouns as children
        if PRINT_DEBUG:
            # print "Missing NP"
            print nodeText(m[0])
            print nodeText(m[1])
            #raise SystemExit
        return False
    return True
# Example #5
# 0
def printAllMatchListInfo(matchList):
    print "\nMatch list:"
    for (ph1, ph2) in matchList:
        print ph1.treeposition(), "---", ph2.treeposition(), "\t\t", ph1.tag(
        ), nodeText(ph1), " --- ", ph2.tag(), nodeText(ph2)
    print