コード例 #1
0
ファイル: tree.py プロジェクト: steve3p0/LING511
    def embed_n_np(self, t: nltk.Tree):
        """Wrap a noun that is directly followed by another noun in its own NP.

        Recursively walks ``t``. A child labelled ``NN`` whose right sibling
        is also labelled ``NN`` is removed and ``(NP (NN <word>))`` is
        inserted at the position reported by ``self.get_position``.
        Nothing is returned.

        NOTE(review): ``right_sibling()`` is called on the children, so the
        input presumably has to be an ``nltk.ParentedTree`` even though the
        annotation says ``nltk.Tree`` -- confirm against callers.
        """

        # NOTE(review): the adverb tags below are not referenced in this
        # method; they look copied from convert_adv_deg.
        # RB - Adverb
        # RBR - Adverb, comparative
        # RBS - Adverb, superlative

        # Anything without a label() method (e.g. a leaf string) is skipped.
        try:
            t.label()
        except AttributeError:
            # print(t)
            return

        # Any exception raised inside this loop ends the scan of this level
        # silently (for instance a child that has no label(), or a
        # right_sibling() result that has no label()).
        try:
            for child in t:
                #t = nltk.ParentedTree.convert(t)
                if child.label() == child.right_sibling().label() == "NN":
                    # noun = child
                    # A fresh NN node is built from the child's first element
                    # instead of re-using `child` itself.
                    noun = nltk.ParentedTree("NN", [child[0]])

                    np = nltk.ParentedTree("NP", [noun])
                    child_pos = self.get_position(child, t)
                    t.remove(child)
                    t.insert(child_pos, np)

                    # NOTE(review): this rebinds the local name `t`. If
                    # convert() returns a new tree (as it appears to), any
                    # later replacement in this loop and the recursion below
                    # act on that copy rather than on the caller's tree --
                    # confirm this is intended.
                    t = nltk.ParentedTree.convert(t)
                    # NOTE(review): `parent` is assigned but never read.
                    parent = t.parent()
                    parent = nltk.ParentedTree.convert(parent)
        except Exception:
            #print("swallow hard!")
            pass

        # Descend into the children of whatever `t` is bound to at this point.
        for child in t:
            self.embed_n_np(child)
コード例 #2
0
def make(srcSnt, tgtSnt, srcTree, tgtTree, wa, gsuba, base):
    """Write subtree-alignment training examples to '/dev/shm/subaFeatEx.<base>'.

    For every sentence pair a ``Bead2`` is built from the two trees, the word
    alignment matrix and the gold subtree alignments. Each subtree alignment
    then yields one output line of the form

        ID<sentID>--<suba> TAB <label> TAB <feature> TAB <feature> ...

    The entries of ``bead.otherSuba`` are written first with label ``False``,
    followed by the entries of ``bead.goldSuba`` with label ``True``.
    ``sentID`` starts at ``base`` and grows by one per sentence.

    The parallel sequences ``srcSnt``, ``tgtSnt``, ``srcTree``, ``tgtTree``,
    ``wa`` and ``gsuba`` are indexed by sentence position.
    """
    sentID = base

    # The context manager closes the output file even when building a bead or
    # extracting features raises; the previous version leaked the handle.
    with codecs.open('/dev/shm/subaFeatEx.' + str(base), 'w', 'utf-8') as f:

        def writeExamples(bead, subaList, label):
            # One line per subtree alignment; features are computed before
            # the identifier, as in the original tuple construction.
            for suba in subaList:
                feats = features(bead, suba)
                exampleID = str(sentID) + '--' + suba.__str__()
                f.write('ID' + exampleID + '\t' + str(label) + '\t' +
                        '\t'.join(feats) + '\n')

        for i, srcLine in enumerate(srcSnt):
            # Progress indicator every 1000 sentences (Python 2 print form).
            if i % 1000 == 0:
                print >> sys.stderr, i,
            bead = Bead2(nltk.ParentedTree(srcTree[i]),
                         nltk.ParentedTree(tgtTree[i]),
                         oneline2waMatrix(wa[i], len(srcLine.split()),
                                          len(tgtSnt[i].split())),
                         oneline2subaList(gsuba[i]))

            writeExamples(bead, bead.otherSuba, False)  # negative examples
            writeExamples(bead, bead.goldSuba, True)  # positive examples
            sentID += 1
コード例 #3
0
    def test_get_head_s(self):
        parse = """(S
  (S
    (NP
      (NP
        (DT The)
        (ADJP (RBS most) (JJ important))
        (JJ Taiwanese)
        (JJ musical)
        (NN master))
      (PP (IN of) (NP (DT the) (JJ last) (JJ half) (NN century)))))
  (, ,)
  (NP (PRP he))
  (VP
    (VBD was)
    (NP
      (NP (DT a) (JJ beloved) (NN teacher))
      (PP (IN to) (NP (JJ many)))))
  (. .))"""

        self.assertEqual(nltk.ParentedTree("VBD", ["was"]), self.head_finder.get_head(nltk.ParentedTree.fromstring(parse)))

        parse_2 = "(S (`` `) (NP (NNP Bus) (NNP Stop) (POS ')))"

        self.assertEqual(nltk.ParentedTree("NNP", ["Stop"]),
                         self.head_finder.get_head(
                             nltk.ParentedTree.fromstring(parse_2)))
コード例 #4
0
    def test_get_difficult_heads(self):
        parse = """(NP
  (S
    (VP
      (VP
        (VBG recalling)
        (NP (DT the) (JJ Korean) (NN delegation))
        (PP
          (IN to)
          (NP
            (DT the)
            (NNP Korean)
            (NML (NNP Military) (NNP Armistice))
            (NNP Commission))))
      (CC and)
      (VP
        (VBG setting)
        (PRT (RP up))
        (NP
          (NP
            (DT the)
            (NNP Panmunjom)
            (NNP Representative)
            (NNP Office))
          (PP
            (IN of)
            (NP
              (NP (DT the) (NNP Korean) (NNPS People) (POS 's))
              (NNP Army))))
        (PP (IN as) (NP (DT the) (JJ negotiatory) (NN organization))))))
  (, ,)
  (ADVP (FW etc)))"""

        parse2 = """(NP
  (QP (NNS Tens) (IN of) (NNS thousands))
  (PP (IN of) (NP (NNS people))))"""

        parse3 = """(NP
  (PRP he)
  (PRN
    (-LRB- -LRB-)
    (NP
      (NP (DT the) (NN one))
      (SBAR
        (WHNP (WP who))
        (S (VP (VBD tricked) (NP (DT these) (NNS people))))))
    (-RRB- -RRB-)))"""

        parse4 = """(UCP
  (NP (NN %um))
  (CC and)
  (S (NP (PRP you)) (VP (MD can) (VP (ADVP (RB also))))))"""

        self.assertEqual(nltk.ParentedTree("FW", ["etc"]), self.head_finder.get_head(nltk.ParentedTree.fromstring(parse)))
        self.assertEqual(nltk.ParentedTree("NNS", ["Tens"]), self.head_finder.get_head(nltk.ParentedTree.fromstring(parse2)))
        self.assertEqual(nltk.ParentedTree("-LRB-", ["-LRB-"]), self.head_finder.get_head(nltk.ParentedTree.fromstring(parse3)))
        self.assertEqual(nltk.ParentedTree("MD", ["can"]), self.head_finder.get_head(nltk.ParentedTree.fromstring(parse4)))
コード例 #5
0
def loadData(srcTrList, tgtTrList, waList, alignFunc, ruleExFlag,
             wordRulesFlag, minMemFlag, procID, verbose, extensiveRulesFlag,
             fractionalCountFlag, phraseRulesFlag, s2t):
    """Build a ``SntFrame`` per sentence pair and collect basic glue-rule labels.

    ``srcTrList``, ``tgtTrList`` and ``waList`` are parallel sequences; one
    entry of ``waList`` is a whitespace-separated list of ``src-tgt`` links
    whose sides are comma-separated integers.

    Two modes, selected by ``minMemFlag``:

    * true  -- rules are streamed to ``/dev/shm/hacept/rule.<procID>`` and
      ``ruleInv.<procID>`` (glue rules to ``glueRule.<procID>`` when ``s2t``
      is set); sentence pairs with an empty tree are skipped; the first
      element of the return value is ``[None]``.
    * false -- the frames are kept in a list, with ``None`` standing in for
      a sentence pair that has an empty tree.

    Returns ``(frames, basicGlueRuleTopLabels, basicGlueRuleLabels)``; the
    two label sets are only filled when ``s2t`` is true.
    """
    basicGlueRuleTopLabels, basicGlueRuleLabels = set(), set()
    result = []
    f1 = f2 = gf1 = None

    try:
        if minMemFlag:
            if 'hacept' not in os.listdir('/dev/shm'):
                os.mkdir('/dev/shm/hacept')
            f1 = codecs.open('/dev/shm/hacept/rule.' + str(procID), 'w',
                             'utf-8')
            f2 = codecs.open('/dev/shm/hacept/ruleInv.' + str(procID), 'w',
                             'utf-8')
            gf1 = codecs.open('/dev/shm/hacept/glueRule.' + str(procID), 'w',
                              'utf-8')

        for i, waLine in enumerate(waList):
            srcTr = nltk.ParentedTree(srcTrList[i])
            tgtTr = nltk.ParentedTree(tgtTrList[i])
            links = [item.split('-') for item in waLine.split()]
            # Distinct comprehension names: under Python 2 these leak into
            # the function scope and previously overwrote the loop index.
            wa = [([int(s) for s in link[0].split(',')],
                   [int(t) for t in link[1].split(',')]) for link in links]

            if len(srcTr.leaves()) == 0 or len(tgtTr.leaves()) == 0:
                # No frame exists for this pair, so there are no labels to
                # merge either. The old code fell through to the label update
                # in list mode and hit an unbound (or stale) tmpSntFrame.
                if not minMemFlag:
                    result.append(None)
                continue

            tmpSntFrame = SntFrame(srcTr, tgtTr, wa, alignFunc, ruleExFlag,
                                   wordRulesFlag, extensiveRulesFlag,
                                   fractionalCountFlag, phraseRulesFlag,
                                   s2t, verbose)

            if minMemFlag:
                for rule in tmpSntFrame.ruleList:
                    f1.write(rule[0])
                    f2.write(rule[1])
                if s2t:
                    for rule in tmpSntFrame.glueRuleList:
                        gf1.write(rule[0])
            else:
                result.append(tmpSntFrame)

            if s2t:
                basicGlueRuleTopLabels.update(
                    tmpSntFrame.basicGlueRuleTopLabels)
                basicGlueRuleLabels.update(tmpSntFrame.basicGlueRuleLabels)
    finally:
        # Flush and release the rule files on success and on error alike;
        # they were never closed before.
        for handle in (f1, f2, gf1):
            if handle is not None:
                handle.close()

    if minMemFlag:
        return [None], basicGlueRuleTopLabels, basicGlueRuleLabels
    return result, basicGlueRuleTopLabels, basicGlueRuleLabels
コード例 #6
0
 def test_get_head_sq(self):
     # The SQ below is expected to be headed by its leading (VBP are).
     question = nltk.ParentedTree.fromstring(
         "(SQ (VBP are) (NP (PRP they)) (NP (DT all) (NNS liars)))")
     expected = nltk.ParentedTree("VBP", ["are"])
     self.assertEqual(expected, self.head_finder.get_head(question))
コード例 #7
0
    def test_get_head_sbarq(self):
        parse = """(SBARQ
  (WHADVP (WRB Where))
  (SQ (MD Should) (NP (NNP Chinese) (NNP Music)) (VP (VB Go)))
  (. ?))"""

        self.assertEqual(nltk.ParentedTree("MD", ["Should"]), self.head_finder.get_head(nltk.ParentedTree.fromstring(parse)))
コード例 #8
0
    def test_get_head_pp(self):
        parse = """(PP
  (IN of)
  (NP
    (NP (NNS thousands))
    (PP (IN of) (NP (JJ non-profit) (NNS institutions)))))"""

        self.assertEqual(nltk.ParentedTree("IN", ["of"]), self.head_finder.get_head(nltk.ParentedTree.fromstring(parse)))
コード例 #9
0
def isLegalTree(line, i):
    """Abort the program if ``line`` cannot be parsed as a tree.

    ``line`` is handed to both ``nltk.Tree`` and ``nltk.ParentedTree``. When
    either raises ``ValueError``, the sentence number ``i`` and the offending
    line are printed to stderr and the process exits with status 1.
    Otherwise the function returns ``None``.
    """
    try:
        # Only the side effect of parsing matters; the trees are discarded.
        nltk.Tree(line)
        nltk.ParentedTree(line)
    except ValueError:
        print >> sys.stderr, "illegal tree!!! #" + str(i)
        print >> sys.stderr, line
        # sys.exit rather than the bare exit() helper, which is injected by
        # the `site` module and is not guaranteed to exist in every run mode.
        sys.exit(1)
コード例 #10
0
    def test_get_head_sbar(self):
        parse = """(SBAR
  (WHNP (WP who))
  (S
    (VP
      (VBD had)
      (VP
        (VBN had)
        (NP (NP (JJ enough)) (PP (IN of) (NP (NN schooling))))))))"""

        self.assertEqual(nltk.ParentedTree("WP", ["who"]), self.head_finder.get_head(nltk.ParentedTree.fromstring(parse)))
コード例 #11
0
    def test_head_rule_cc(self):
        # For two coordinated NPs the conjunction is the expected head.
        coordination = """(NP
        (NP
            (NNS ruin))
        (CC and)
        (NP
            (NNS terror)))
        """
        expected = nltk.ParentedTree("CC", ["and"])
        tree = nltk.ParentedTree.fromstring(coordination)

        self.assertEqual(expected, self.head_finder.get_head(tree))
コード例 #12
0
 def test_get_head_np(self):
     # (expected tag, expected word, NP bracketing), checked in this order.
     cases = [
         ("NNS", "police", "(NP (JJ Local) (NNS police))"),
         ("NN", "shop", "(NP (JJ Local) (NN shop))"),
         ("NNP", "NBC", "(NP (NNP NBC) (POS 's))"),
         ("NN", "wedding", "(NP (NP (NP (PRP$ his) (NN brother) (POS 's)) (NN wedding)) (PP (IN in) (NP (NNP Khan) (NNPS Younes))))"),
         ("NNP", "Taiwan", "(NP (NNP Taiwan) (POS 's))"),
         ("NN", "port", "(NP (NP (NP (NNP Yemen) (POS 's)) (NN port)) (PP (IN of) (NP (NNP Aden))))"),
     ]
     for tag, word, bracketing in cases:
         expected = nltk.ParentedTree(tag, [word])
         tree = nltk.ParentedTree.fromstring(bracketing)
         self.assertEqual(expected, self.head_finder.get_head(tree))
コード例 #13
0
def flatten2one(tr):
    """Return a new two-level tree: the label of ``tr`` over all its preterminals.

    Every subtree of height 2 contributes one ``(label word)`` pair, in the
    order ``tr.subtrees()`` yields them. If a height-2 subtree has a label or
    first child that is not a ``str``, it is printed and the process exits
    with status 1.
    """
    pieces = ['(' + tr.node + ' ']
    for pre in tr.subtrees():
        if pre.height() != 2:
            continue
        if not (isinstance(pre.node, str) and isinstance(pre[0], str)):
            # Parenthesised single argument: same output as the Python 2
            # print statement.
            print(pre)
            exit(1)
        pieces.append('(' + pre.node + ' ' + pre[0] + ') ')
    pieces.append(')')
    return nltk.ParentedTree(''.join(pieces))
コード例 #14
0
    def test_get_head_frag(self):
        parse = """(FRAG
  (PP (IN On) (NP (DT the) (NN internet) (NN type)))
  (NP (NNP Iraq))
  (: :)
  (NP (NNP Beyond) (NNP Abu) (NNP Ghraib))
  (: :)
  (NP
    (NP (NN Detention) (CC and) (NN torture))
    (PP (IN in) (NP (NNP Iraq))))
  (. .))"""

        self.assertEqual(nltk.ParentedTree(".", ["."]), self.head_finder.get_head(
            nltk.ParentedTree.fromstring(
            parse)))
コード例 #15
0
def grow_branches(starting_node, from_list):
    """Recursively build the top-down subtree rooted at ``starting_node``.

    An ``SST_Token`` is a leaf and is returned unchanged. For any other node
    the children are ordered by the smallest index in their
    ``get_leaf_indices()``, each is grown recursively, and the results become
    the children of a new ``nltk.ParentedTree`` labelled with
    ``starting_node``.

    ``from_list`` is only passed along to the recursive calls.
    """
    # Exact type match, not isinstance: tokens end the recursion.
    if type(starting_node) is SST_Token:
        return starting_node

    def leftmost_leaf(node):
        return min(node.get_leaf_indices())

    # Left-to-right order by first covered leaf.
    ordered = sorted(starting_node.children, key=leftmost_leaf)
    grown = [grow_branches(kid, from_list) for kid in ordered]
    return nltk.ParentedTree(starting_node, grown)
コード例 #16
0
ファイル: tree.py プロジェクト: steve3p0/LING511
    def convert_adv_deg(self, t: nltk.Tree):
        """Relabel a leading "too" / "very" adverb as a ``Deg`` node.

        Recursively walks ``t``. In an ``ADJP`` or ``ADVP`` whose first child
        is an ``RB`` reading "too" or "very" and whose second child is an
        ``RB`` or ``JJ``, the first child is replaced by ``(Deg <word>)``.
        Nothing is returned.

        NOTE(review): ``right_sibling()`` is called on the adverb, so the
        rewrite presumably only fires for ``nltk.ParentedTree`` input even
        though the annotation says ``nltk.Tree`` -- confirm against callers.
        """
        # Anything without a label() method (e.g. a leaf string) is skipped.
        try:
            t.label()
        except AttributeError:
            return

        if t.label() in ["ADJP", "ADVP"]:
            phrase = t

            # Best effort: a phrase that does not have the expected shape
            # (too few children, leaf children, no right sibling) is left
            # untouched. `except Exception` replaces the former bare
            # `except:` so KeyboardInterrupt / SystemExit are no longer
            # swallowed; it matches the handler used in embed_n_np.
            try:
                if phrase[0].label() == "RB" and \
                   phrase[1].label() in ["RB", "JJ"]:
                    adv = phrase[0]
                    # phrase[1] was read above, so the phrase is known to
                    # have more than one child here.
                    if adv[0] in ["too", "very"] and \
                       adv.right_sibling().label() in ["RB", "JJ"]:
                        deg = nltk.ParentedTree("Deg", [adv[0]])
                        t.remove(t[0])
                        t.insert(0, deg)

                        # NOTE(review): rebinding `t` means the recursion
                        # below walks the converted tree; if convert()
                        # returns a copy, deeper edits would not reach the
                        # caller's tree -- confirm this is intended.
                        t = nltk.ParentedTree.convert(t)
            except Exception:
                pass

        for child in t:
            self.convert_adv_deg(child)
コード例 #17
0
def flatten(line, level):
    """Flatten subtrees of height ``level`` and return the tree on one line.

    ``line`` is parsed into an ``nltk.ParentedTree``. Trees of height 3 or
    less are returned unchanged (whitespace-normalised). ``level`` is capped
    at the tree's height. Every subtree whose height equals ``level`` is
    replaced by ``flatten2one(subtree)``; if that subtree has no parent it
    becomes the whole result and the scan stops.
    """

    def oneLine(tree):
        # Collapse the pretty-printed form onto a single line.
        return ' '.join(tree.pprint().split())

    tr = nltk.ParentedTree(line.strip())
    treeHeight = tr.height()
    if treeHeight <= 3:
        return oneLine(tr)

    # A level beyond the tree's height means "flatten as much as possible".
    level = min(level, treeHeight)

    for subt in tr.subtrees():
        if subt.height() != level:
            continue
        parent = subt.parent()
        if not parent:
            # The matching subtree is the root itself.
            tr = flatten2one(subt)
            break
        parent[subt.parent_index()] = flatten2one(subt)

    return oneLine(tr)
コード例 #18
0
 def test_get_head_ucp(self):
     # The last conjunct is the expected head of this UCP.
     phrase = nltk.ParentedTree.fromstring("(UCP (JJ economic) (CC and) (NN trade))")
     expected = nltk.ParentedTree("NN", ["trade"])
     self.assertEqual(expected, self.head_finder.get_head(phrase))
コード例 #19
0
 def test_get_head_vp(self):
     # A single-verb VP is expected to be headed by that verb.
     phrase = nltk.ParentedTree.fromstring("(VP (VB shoot))")
     expected = nltk.ParentedTree("VB", ["shoot"])
     self.assertEqual(expected, self.head_finder.get_head(phrase))
コード例 #20
0
 def test_get_head_x(self):
     # For the X constituent the last child is the expected head.
     phrase = nltk.ParentedTree.fromstring("(X (NNS Men) (CC or) (: --))")
     expected = nltk.ParentedTree(":", ["--"])
     self.assertEqual(expected, self.head_finder.get_head(phrase))
コード例 #21
0
 def test_get_head_intj(self):
     # The first child (UH oh) is the expected head of this INTJ.
     phrase = nltk.ParentedTree.fromstring("(INTJ (UH oh) (PRP$ my) (NNP god))")
     expected = nltk.ParentedTree("UH", ["oh"])
     self.assertEqual(expected, self.head_finder.get_head(phrase))
コード例 #22
0
 def test_get_head_adjp(self):
     # The first adjective is the expected head of the coordinated ADJP.
     phrase = nltk.ParentedTree.fromstring("(ADJP (JJ twelfth) (CC and) (JJ thirteenth))")
     expected = nltk.ParentedTree("JJ", ["twelfth"])
     self.assertEqual(expected, self.head_finder.get_head(phrase))
コード例 #23
0
 def test_get_head_whnp(self):
     # A one-word WHNP is expected to be headed by its wh-pronoun.
     phrase = nltk.ParentedTree.fromstring("(WHNP (WP who))")
     expected = nltk.ParentedTree("WP", ["who"])
     self.assertEqual(expected, self.head_finder.get_head(phrase))
コード例 #24
0
 def test_get_head_advp(self):
     # A one-word ADVP is expected to be headed by its adverb.
     phrase = nltk.ParentedTree.fromstring("(ADVP (RB here))")
     expected = nltk.ParentedTree("RB", ["here"])
     self.assertEqual(expected, self.head_finder.get_head(phrase))
コード例 #25
0
 def test_get_head_qp(self):
     # The first cardinal number is the expected head of this QP.
     phrase = nltk.ParentedTree.fromstring("(QP (CD forty) (HYPH -) (CD five))")
     expected = nltk.ParentedTree("CD", ["forty"])
     self.assertEqual(expected, self.head_finder.get_head(phrase))
コード例 #26
0
 def test_get_head_whadvp(self):
     # A one-word WHADVP is expected to be headed by its wh-adverb.
     phrase = nltk.ParentedTree.fromstring("(WHADVP (WRB how))")
     expected = nltk.ParentedTree("WRB", ["how"])
     self.assertEqual(expected, self.head_finder.get_head(phrase))
コード例 #27
0
    def loadData(cls, filename):
        """
        Return a list of ``cls`` instances read from an alignment file.

        The file is decoded as UTF-8, passed through ``util.cleanData`` and
        split on blank lines; every chunk except the last is parsed as one
        sentence pair. Within a chunk:

        * ``SOURCE`` / ``TARGET`` lines carry a bracketed tree starting at
          column 8;
        * ``<mapping>`` ... ``</mapping>`` lists subtree alignments, stored
          as ``(x1, y1, x2, y2)`` with the start positions shifted by -1;
        * ``<alignment>`` ... ``</alignment>`` lists word alignments with
          1-based, comma-separated indexes, where ``-1`` entries are ignored;
        * ``</bead>`` ends the chunk.

        A chunk is dropped when its source line starts with ``ERROR`` or when
        either tree has no leaves.

        :type filename: str
        :param filename: a file with trees, word alignments, and subtree
            alignments for multiple sentence pairs.
        """
        import codecs
        # The context manager closes the file even if reading or cleaning
        # raises; the handle used to leak in that case.
        with codecs.open(filename, 'r', 'utf-8') as f:
            blocks = util.cleanData(f.read()).split('\n\n')

        beadList = []

        srcTree = None
        tgtTree = None
        wordAlignment = None
        subtreeAlignment = []

        # NOTE(review): the last chunk is never parsed -- presumably it is
        # the empty remainder after a trailing blank line; confirm.
        for block in blocks[:-1]:
            block = block.split('\n')
            i = 0
            errFlag = False
            while i < len(block):
                line = block[i]
                if line.startswith('SOURCE'):
                    if line[8:].startswith('ERROR'):
                        errFlag = True
                        break
                    srcTree = nltk.ParentedTree(line[8:])
                    if srcTree.leaves() == []:
                        errFlag = True
                        break
                    i += 1
                    continue

                elif line.startswith('TARGET'):
                    tgtTree = nltk.ParentedTree(line[8:])
                    if tgtTree.leaves() == []:
                        errFlag = True
                        break
                    # One row per source leaf, one column per target leaf.
                    wordAlignment = [[
                        0 for j in xrange(len(tgtTree.leaves()))
                    ] for k in xrange(len(srcTree.leaves()))]
                    i += 1
                    continue

                elif line.startswith('<mapping>'):
                    i += 1
                    line = block[i]
                    while not line.startswith('</mapping>'):
                        fields = line.split()
                        # First and last number of each comma-separated side.
                        srcSpan = fields[0].split(',')
                        x1 = int(srcSpan[0]) - 1
                        x2 = int(srcSpan[-1])

                        tgtSpan = fields[1].split(',')
                        y1 = int(tgtSpan[0]) - 1
                        y2 = int(tgtSpan[-1])

                        subtreeAlignment.append((x1, y1, x2, y2))
                        i += 1
                        line = block[i]

                    # Step past the closing tag.
                    i += 1
                    continue

                elif line.startswith('<alignment>'):
                    i += 1
                    line = block[i]
                    while not line.startswith('</alignment>'):
                        fields = line.split()
                        # Convert to 0-based indexes, skipping -1 entries.
                        srcIndexes = [
                            int(item) - 1
                            for item in fields[0].split(',')
                            if int(item) != -1
                        ]
                        tgtIndexes = [
                            int(item) - 1
                            for item in fields[1].split(',')
                            if int(item) != -1
                        ]
                        for srcIndex in srcIndexes:
                            for tgtIndex in tgtIndexes:
                                wordAlignment[srcIndex][tgtIndex] = 1
                        i += 1
                        line = block[i]

                    # Step past the closing tag.
                    i += 1
                    continue

                elif line.startswith('</bead>'):
                    break

                i += 1

            if not errFlag:
                beadList.append(
                    cls(srcTree, tgtTree, wordAlignment, subtreeAlignment,
                        False, False, False, False, False))
            # Reset the per-chunk state, whether or not a bead was created.
            srcTree, tgtTree, wordAlignment, subtreeAlignment = None, None, None, []

        return beadList
コード例 #28
0
 def test_get_head_nml(self):
     # The rightmost noun is the expected head of this NML.
     phrase = nltk.ParentedTree.fromstring("(NML (NN air) (NN curtain))")
     expected = nltk.ParentedTree("NN", ["curtain"])
     self.assertEqual(expected, self.head_finder.get_head(phrase))