def embed_n_np(self, t: nltk.Tree):
    """Wrap the first noun of every adjacent ``NN NN`` pair in its own ``NP``.

    Walks ``t`` top-down. Whenever a child labelled ``NN`` is immediately
    followed by a sibling that is also labelled ``NN``, that child is
    replaced in place by ``(NP (NN <word>))``. The tree is modified in
    place and nothing is returned.

    Earlier versions rebound ``t`` to ``nltk.ParentedTree.convert(t)`` after
    the first rewrite. ``convert`` builds a new tree, so every later rewrite
    and the whole recursion were applied to a detached copy and lost.

    :param t: node to process. Objects without a ``label()`` method (leaf
        tokens) are ignored. Sibling lookup uses ``right_sibling()``, so
        children that do not provide it are never rewritten.
    """
    try:
        t.label()
    except AttributeError:
        # Leaf token: nothing to rewrite and nothing to descend into.
        return

    # Iterate over a snapshot because matching children are swapped out of
    # ``t`` while the loop is running.
    for child in list(t):
        try:
            sibling = child.right_sibling()
            is_noun_pair = child.label() == sibling.label() == "NN"
        except AttributeError:
            # ``child`` is a leaf, is the last child (sibling is ``None``),
            # or does not support sibling lookup: no noun pair here.
            continue
        if not is_noun_pair:
            continue
        noun = nltk.ParentedTree("NN", [child[0]])
        noun_phrase = nltk.ParentedTree("NP", [noun])
        # presumably get_position returns the index of ``child`` within
        # ``t`` -- verify against its definition.
        child_pos = self.get_position(child, t)
        # NOTE(review): ``remove`` locates the child by equality, so an
        # earlier sibling that compares equal would be removed instead.
        t.remove(child)
        t.insert(child_pos, noun_phrase)

    # Recurse into the (possibly rewritten) children of the same tree.
    for child in t:
        self.embed_n_np(child)
def _writeSubaExamples(out, bead, subaList, isGold, sentID):
    """Write one training line per subtree alignment in ``subaList``.

    Each line is ``ID<sentID>--<suba>``, the label ``isGold`` and the
    tab-joined result of ``features(bead, suba)``, separated by tabs.
    """
    for suba in subaList:
        exampleID = str(sentID) + '--' + suba.__str__()
        out.write('ID' + exampleID + '\t' + str(isGold) + '\t'
                  + '\t'.join(features(bead, suba)) + '\n')


def make(srcSnt, tgtSnt, srcTree, tgtTree, wa, gsuba, base):
    """Write subtree-alignment training examples for a batch of sentence pairs.

    For every index ``i`` a ``Bead2`` is built from the i-th source/target
    trees, the word-alignment matrix and the gold subtree-alignment list.
    The alignments in ``bead.otherSuba`` are written as negative examples
    (label ``False``), then those in ``bead.goldSuba`` as positive examples
    (label ``True``). Output goes to ``/dev/shm/subaFeatEx.<base>``.
    The index is printed to stderr every 1000 sentences.

    The file is opened with a context manager so that it is closed (and
    flushed) even when building a bead or extracting features raises.

    :param srcSnt: source sentences; only their token counts are used.
    :param tgtSnt: target sentences; only their token counts are used.
    :param srcTree: source tree strings, parallel to ``srcSnt``.
    :param tgtTree: target tree strings, parallel to ``srcSnt``.
    :param wa: one-line word alignments, parallel to ``srcSnt``.
    :param gsuba: one-line gold subtree alignments, parallel to ``srcSnt``.
    :param base: sentence ID of the first pair; also the output file suffix.
    """
    with codecs.open('/dev/shm/subaFeatEx.' + str(base), 'w', 'utf-8') as f:
        sentID = base
        for i in xrange(len(srcSnt)):
            if i % 1000 == 0:
                print >> sys.stderr, i,
            waMatrix = oneline2waMatrix(wa[i], len(srcSnt[i].split()), len(tgtSnt[i].split()))
            bead = Bead2(nltk.ParentedTree(srcTree[i]), nltk.ParentedTree(tgtTree[i]),
                         waMatrix, oneline2subaList(gsuba[i]))
            # add negative training examples
            _writeSubaExamples(f, bead, bead.otherSuba, False, sentID)
            # add positive training examples
            _writeSubaExamples(f, bead, bead.goldSuba, True, sentID)
            sentID += 1
def test_get_head_s(self):
    """Check the heads ``get_head`` returns for two ``S`` parses."""
    cases = [
        ("VBD", "was", """(S (S (NP (NP (DT The) (ADJP (RBS most) (JJ important)) (JJ Taiwanese) (JJ musical) (NN master)) (PP (IN of) (NP (DT the) (JJ last) (JJ half) (NN century))))) (, ,) (NP (PRP he)) (VP (VBD was) (NP (NP (DT a) (JJ beloved) (NN teacher)) (PP (IN to) (NP (JJ many))))) (. .))"""),
        ("NNP", "Stop", "(S (`` `) (NP (NNP Bus) (NNP Stop) (POS ')))"),
    ]
    for tag, word, bracketing in cases:
        expected = nltk.ParentedTree(tag, [word])
        tree = nltk.ParentedTree.fromstring(bracketing)
        self.assertEqual(expected, self.head_finder.get_head(tree))
def test_get_difficult_heads(self):
    """Check ``get_head`` on four less regular parses (NP over S, QP, PRN, UCP)."""
    cases = [
        ("FW", "etc", """(NP (S (VP (VP (VBG recalling) (NP (DT the) (JJ Korean) (NN delegation)) (PP (IN to) (NP (DT the) (NNP Korean) (NML (NNP Military) (NNP Armistice)) (NNP Commission)))) (CC and) (VP (VBG setting) (PRT (RP up)) (NP (NP (DT the) (NNP Panmunjom) (NNP Representative) (NNP Office)) (PP (IN of) (NP (NP (DT the) (NNP Korean) (NNPS People) (POS 's)) (NNP Army)))) (PP (IN as) (NP (DT the) (JJ negotiatory) (NN organization)))))) (, ,) (ADVP (FW etc)))"""),
        ("NNS", "Tens", """(NP (QP (NNS Tens) (IN of) (NNS thousands)) (PP (IN of) (NP (NNS people))))"""),
        ("-LRB-", "-LRB-", """(NP (PRP he) (PRN (-LRB- -LRB-) (NP (NP (DT the) (NN one)) (SBAR (WHNP (WP who)) (S (VP (VBD tricked) (NP (DT these) (NNS people)))))) (-RRB- -RRB-)))"""),
        ("MD", "can", """(UCP (NP (NN %um)) (CC and) (S (NP (PRP you)) (VP (MD can) (VP (ADVP (RB also))))))"""),
    ]
    for tag, word, bracketing in cases:
        expected = nltk.ParentedTree(tag, [word])
        tree = nltk.ParentedTree.fromstring(bracketing)
        self.assertEqual(expected, self.head_finder.get_head(tree))
def _parseWordAlignment(waLine):
    """Parse one word-alignment line into ``(srcIndexes, tgtIndexes)`` pairs.

    Each whitespace-separated item is split on ``-``; the comma-separated
    numbers on either side are converted to ``int``.
    """
    links = []
    for item in waLine.split():
        sides = item.split('-')
        links.append(([int(s) for s in sides[0].split(',')],
                      [int(t) for t in sides[1].split(',')]))
    return links


def loadData(srcTrList, tgtTrList, waList, alignFunc, ruleExFlag, wordRulesFlag, minMemFlag, procID, verbose, extensiveRulesFlag, fractionalCountFlag, phraseRulesFlag, s2t):
    """Build a ``SntFrame`` for every sentence pair and collect its rules.

    With ``minMemFlag`` set, the rules of each frame are written straight to
    ``/dev/shm/hacept/rule.<procID>`` and ``ruleInv.<procID>`` (and glue
    rules to ``glueRule.<procID>`` when ``s2t`` is set); pairs with an empty
    tree are skipped and ``[None]`` is returned in place of the frame list.
    Without it, the frames are returned in a list, with ``None`` standing in
    for pairs that have an empty tree.

    When ``s2t`` is set, the basic glue-rule labels of every frame built in
    the current iteration are accumulated into the two returned sets; labels
    are never read from a frame left over from an earlier iteration.

    Changes over the previous version: the output files are always closed,
    and the output directory is created without a check-then-create race
    between workers.

    :param srcTrList: source tree strings.
    :param tgtTrList: target tree strings, parallel to ``srcTrList``.
    :param waList: word-alignment lines, parallel to ``srcTrList``.
    :param procID: suffix of the output file names in ``minMemFlag`` mode.
    :param s2t: enables glue-rule output and label collection.
    :return: ``(frames, basicGlueRuleTopLabels, basicGlueRuleLabels)``.

    The remaining arguments are passed through to ``SntFrame`` unchanged.
    """
    result = []
    basicGlueRuleTopLabels, basicGlueRuleLabels = set([]), set([])
    openFiles = []
    try:
        if minMemFlag:
            try:
                os.mkdir('/dev/shm/hacept')
            except OSError:
                # Another worker may have created it first; only a missing
                # directory is a real failure.
                if not os.path.isdir('/dev/shm/hacept'):
                    raise
            f1 = codecs.open('/dev/shm/hacept/rule.' + str(procID), 'w', 'utf-8')
            openFiles.append(f1)
            f2 = codecs.open('/dev/shm/hacept/ruleInv.' + str(procID), 'w', 'utf-8')
            openFiles.append(f2)
            gf1 = codecs.open('/dev/shm/hacept/glueRule.' + str(procID), 'w', 'utf-8')
            openFiles.append(gf1)

        for i in xrange(len(waList)):
            srcTr = nltk.ParentedTree(srcTrList[i])
            tgtTr = nltk.ParentedTree(tgtTrList[i])
            wa = _parseWordAlignment(waList[i])

            if len(srcTr.leaves()) == 0 or len(tgtTr.leaves()) == 0:
                # Keep the result list aligned with the input when frames
                # are being returned.
                if not minMemFlag:
                    result.append(None)
                continue

            tmpSntFrame = SntFrame(srcTr, tgtTr, wa, alignFunc, ruleExFlag, wordRulesFlag, extensiveRulesFlag, fractionalCountFlag, phraseRulesFlag, s2t, verbose)
            if minMemFlag:
                for rule in tmpSntFrame.ruleList:
                    f1.write(rule[0])
                    f2.write(rule[1])
                if s2t:
                    for rule in tmpSntFrame.glueRuleList:
                        gf1.write(rule[0])
            else:
                result.append(tmpSntFrame)

            if s2t:
                basicGlueRuleTopLabels.update(tmpSntFrame.basicGlueRuleTopLabels)
                basicGlueRuleLabels.update(tmpSntFrame.basicGlueRuleLabels)
    finally:
        for handle in openFiles:
            handle.close()

    if minMemFlag:
        return [None], basicGlueRuleTopLabels, basicGlueRuleLabels
    return result, basicGlueRuleTopLabels, basicGlueRuleLabels
def test_get_head_sq(self):
    """Check that ``get_head`` returns ``(VBP are)`` for an ``SQ`` parse."""
    expected = nltk.ParentedTree("VBP", ["are"])
    tree = nltk.ParentedTree.fromstring(
        "(SQ (VBP are) (NP (PRP they)) (NP (DT all) (NNS liars)))")
    self.assertEqual(expected, self.head_finder.get_head(tree))
def test_get_head_sbarq(self):
    """Check that ``get_head`` returns ``(MD Should)`` for an ``SBARQ`` parse."""
    parse = """(SBARQ (WHADVP (WRB Where)) (SQ (MD Should) (NP (NNP Chinese) (NNP Music)) (VP (VB Go))) (. ?))"""
    expected = nltk.ParentedTree("MD", ["Should"])
    tree = nltk.ParentedTree.fromstring(parse)
    self.assertEqual(expected, self.head_finder.get_head(tree))
def test_get_head_pp(self):
    """Check that ``get_head`` returns the leading ``(IN of)`` for a ``PP`` parse."""
    parse = """(PP (IN of) (NP (NP (NNS thousands)) (PP (IN of) (NP (JJ non-profit) (NNS institutions)))))"""
    expected = nltk.ParentedTree("IN", ["of"])
    tree = nltk.ParentedTree.fromstring(parse)
    self.assertEqual(expected, self.head_finder.get_head(tree))
def isLegalTree(line, i):
    """Abort the program if ``line`` cannot be parsed as a tree.

    The line is parsed both as ``nltk.Tree`` and as ``nltk.ParentedTree``.
    If either constructor raises ``ValueError``, the tree number and the
    offending line are printed to stderr and the process exits with
    status 1. Nothing is returned when the line parses.

    :param line: bracketed tree string to check.
    :param i: number of the tree; used only in the error message.
    """
    try:
        # Only the parse attempts matter; the resulting trees are discarded.
        nltk.Tree(line)
        nltk.ParentedTree(line)
    except ValueError:
        print >> sys.stderr, "illegal tree!!! #" + str(i)
        print >> sys.stderr, line
        # sys.exit is always available; the bare ``exit`` helper is only
        # installed by the ``site`` module.
        sys.exit(1)
def test_get_head_sbar(self):
    """Check that ``get_head`` returns ``(WP who)`` for an ``SBAR`` parse."""
    parse = """(SBAR (WHNP (WP who)) (S (VP (VBD had) (VP (VBN had) (NP (NP (JJ enough)) (PP (IN of) (NP (NN schooling))))))))"""
    expected = nltk.ParentedTree("WP", ["who"])
    tree = nltk.ParentedTree.fromstring(parse)
    self.assertEqual(expected, self.head_finder.get_head(tree))
def test_head_rule_cc(self):
    """Check that ``get_head`` returns ``(CC and)`` for a coordinated ``NP``."""
    parse = """(NP (NP (NNS ruin)) (CC and) (NP (NNS terror))) """
    expected = nltk.ParentedTree("CC", ["and"])
    tree = nltk.ParentedTree.fromstring(parse)
    self.assertEqual(expected, self.head_finder.get_head(tree))
def test_get_head_np(self):
    """Check the heads ``get_head`` returns for six ``NP`` parses."""
    cases = [
        ("NNS", "police", "(NP (JJ Local) (NNS police))"),
        ("NN", "shop", "(NP (JJ Local) (NN shop))"),
        ("NNP", "NBC", "(NP (NNP NBC) (POS 's))"),
        ("NN", "wedding", "(NP (NP (NP (PRP$ his) (NN brother) (POS 's)) (NN wedding)) (PP (IN in) (NP (NNP Khan) (NNPS Younes))))"),
        ("NNP", "Taiwan", "(NP (NNP Taiwan) (POS 's))"),
        ("NN", "port", "(NP (NP (NP (NNP Yemen) (POS 's)) (NN port)) (PP (IN of) (NP (NNP Aden))))"),
    ]
    for tag, word, bracketing in cases:
        expected = nltk.ParentedTree(tag, [word])
        tree = nltk.ParentedTree.fromstring(bracketing)
        self.assertEqual(expected, self.head_finder.get_head(tree))
def flatten2one(tr): newLine = '(' + tr.node + ' ' for subt in tr.subtrees(): if subt.height() == 2: if isinstance(subt.node, str) and isinstance(subt[0], str): newLine += '(' + subt.node + ' ' + subt[0] + ') ' else: print subt exit(1) newLine += ')' newTr = nltk.ParentedTree(newLine) #print 'newTr is: ', newTr return newTr
def test_get_head_frag(self):
    """Check that ``get_head`` returns the final ``(. .)`` for a ``FRAG`` parse."""
    parse = """(FRAG (PP (IN On) (NP (DT the) (NN internet) (NN type))) (NP (NNP Iraq)) (: :) (NP (NNP Beyond) (NNP Abu) (NNP Ghraib)) (: :) (NP (NP (NN Detention) (CC and) (NN torture)) (PP (IN in) (NP (NNP Iraq)))) (. .))"""
    expected = nltk.ParentedTree(".", ["."])
    tree = nltk.ParentedTree.fromstring(parse)
    self.assertEqual(expected, self.head_finder.get_head(tree))
def grow_branches(starting_node, from_list):
    """Recursively build the top-down subtree rooted at ``starting_node``.

    An ``SST_Token`` is a leaf and is returned as-is. Any other node has its
    ``children`` sorted by the smallest index in ``get_leaf_indices()``, so
    branches come out left-to-right in sentence order. Each child is grown
    recursively and the results become the children of a new
    ``nltk.ParentedTree`` whose label is ``starting_node`` itself.

    ``from_list`` is only forwarded to the recursive calls.
    """
    # Tokens are leaves; there is nothing to grow below them.
    if type(starting_node) is SST_Token:
        return starting_node

    def first_leaf_index(node):
        return min(node.get_leaf_indices())

    ordered_children = sorted(starting_node.children, key=first_leaf_index)
    branches = [grow_branches(kid, from_list) for kid in ordered_children]
    return nltk.ParentedTree(starting_node, branches)
def convert_adv_deg(self, t: nltk.Tree):
    """Relabel a leading degree adverb of an ``ADJP``/``ADVP`` as ``Deg``.

    Tag legend: RB - adverb, RBR - comparative adverb, RBS - superlative
    adverb.

    Walks ``t`` top-down. In every ``ADJP`` or ``ADVP`` whose first child is
    ``(RB too)`` or ``(RB very)`` and whose next child is labelled ``RB`` or
    ``JJ``, the first child is replaced in place by ``(Deg <word>)``. The
    tree is modified in place and nothing is returned.

    Earlier versions rebound ``t`` to ``nltk.ParentedTree.convert(t)`` after
    a replacement. ``convert`` builds a new tree, so the recursion below a
    converted phrase ran on a detached copy and its results were lost.

    :param t: node to process. Objects without a ``label()`` method (leaf
        tokens) are ignored. The sibling check uses ``right_sibling()``, so
        phrases whose children do not provide it are left untouched.
    """
    try:
        t.label()
    except AttributeError:
        # Leaf token: nothing to convert and nothing to descend into.
        return

    if t.label() in ("ADJP", "ADVP"):
        try:
            adv = t[0]
            is_degree = (
                adv.label() == "RB"
                and adv[0] in ("too", "very")
                and adv.right_sibling().label() in ("RB", "JJ")
            )
        except (AttributeError, IndexError):
            # Empty phrase, leaf child, no right sibling (``None``), or a
            # child without sibling lookup: not a degree construction.
            is_degree = False
        if is_degree:
            deg = nltk.ParentedTree("Deg", [adv[0]])
            t.remove(adv)
            t.insert(0, deg)

    # Recurse into the children of the same (possibly rewritten) tree.
    for child in t:
        self.convert_adv_deg(child)
def flatten(line, level):
    """Flatten the subtrees of height ``level`` in a bracketed tree string.

    ``line`` is stripped and parsed as an ``nltk.ParentedTree``. Trees of
    height 3 or less are returned unchanged. Otherwise ``level`` is capped
    at the tree height, and every subtree of exactly that height is replaced
    by ``flatten2one(subtree)``; when the matching subtree has no parent it
    replaces the whole tree and the search stops.

    :return: the ``pprint()`` output of the resulting tree with all
        whitespace runs collapsed to single spaces.
    """
    tree = nltk.ParentedTree(line.strip())
    full_height = tree.height()
    if full_height > 3:
        # A level beyond the tree height means "flatten as much as possible".
        cutoff = full_height if level > full_height else level
        for node in tree.subtrees():
            if node.height() != cutoff:
                continue
            owner = node.parent()
            if not owner:
                tree = flatten2one(node)
                break
            owner[node.parent_index()] = flatten2one(node)
    return ' '.join(tree.pprint().split())
def test_get_head_ucp(self):
    """Check that ``get_head`` returns ``(NN trade)`` for a ``UCP`` parse."""
    expected = nltk.ParentedTree("NN", ["trade"])
    tree = nltk.ParentedTree.fromstring("(UCP (JJ economic) (CC and) (NN trade))")
    self.assertEqual(expected, self.head_finder.get_head(tree))
def test_get_head_vp(self):
    """Check that ``get_head`` returns ``(VB shoot)`` for a one-word ``VP``."""
    expected = nltk.ParentedTree("VB", ["shoot"])
    tree = nltk.ParentedTree.fromstring("(VP (VB shoot))")
    self.assertEqual(expected, self.head_finder.get_head(tree))
def test_get_head_x(self):
    """Check that ``get_head`` returns the final ``(: --)`` for an ``X`` parse."""
    expected = nltk.ParentedTree(":", ["--"])
    tree = nltk.ParentedTree.fromstring("(X (NNS Men) (CC or) (: --))")
    self.assertEqual(expected, self.head_finder.get_head(tree))
def test_get_head_intj(self):
    """Check that ``get_head`` returns ``(UH oh)`` for an ``INTJ`` parse."""
    expected = nltk.ParentedTree("UH", ["oh"])
    tree = nltk.ParentedTree.fromstring("(INTJ (UH oh) (PRP$ my) (NNP god))")
    self.assertEqual(expected, self.head_finder.get_head(tree))
def test_get_head_adjp(self):
    """Check that ``get_head`` returns ``(JJ twelfth)`` for a coordinated ``ADJP``."""
    expected = nltk.ParentedTree("JJ", ["twelfth"])
    tree = nltk.ParentedTree.fromstring("(ADJP (JJ twelfth) (CC and) (JJ thirteenth))")
    self.assertEqual(expected, self.head_finder.get_head(tree))
def test_get_head_whnp(self):
    """Check that ``get_head`` returns ``(WP who)`` for a one-word ``WHNP``."""
    expected = nltk.ParentedTree("WP", ["who"])
    tree = nltk.ParentedTree.fromstring("(WHNP (WP who))")
    self.assertEqual(expected, self.head_finder.get_head(tree))
def test_get_head_advp(self):
    """Check that ``get_head`` returns ``(RB here)`` for a one-word ``ADVP``."""
    expected = nltk.ParentedTree("RB", ["here"])
    tree = nltk.ParentedTree.fromstring("(ADVP (RB here))")
    self.assertEqual(expected, self.head_finder.get_head(tree))
def test_get_head_qp(self):
    """Check that ``get_head`` returns ``(CD forty)`` for a ``QP`` parse."""
    expected = nltk.ParentedTree("CD", ["forty"])
    tree = nltk.ParentedTree.fromstring("(QP (CD forty) (HYPH -) (CD five))")
    self.assertEqual(expected, self.head_finder.get_head(tree))
def test_get_head_whadvp(self):
    """Check that ``get_head`` returns ``(WRB how)`` for a one-word ``WHADVP``."""
    expected = nltk.ParentedTree("WRB", ["how"])
    tree = nltk.ParentedTree.fromstring("(WHADVP (WRB how))")
    self.assertEqual(expected, self.head_finder.get_head(tree))
def loadData(cls, filename):
    """
    Return a list of Bead instances.

    Load trees, word alignments, and subtree alignments from a file,
    and create Bead instances.

    The file content is passed through ``util.cleanData`` and split on
    blank lines into blocks; the last piece of that split is ignored.
    Inside a block the following lines are recognised:

    - ``SOURCE`` / ``TARGET``: the tree string starts at offset 8 of the
      line. A source entry that starts with ``ERROR``, or a tree without
      leaves, discards the whole block.
    - ``<mapping>`` ... ``</mapping>``: each line holds two
      whitespace-separated fields of comma-separated integers. The first
      number of each field (minus one) and the last number of each field
      are stored as ``(x1, y1, x2, y2)``.
    - ``<alignment>`` ... ``</alignment>``: each line holds two fields of
      comma-separated 1-based word indexes (``-1`` entries are skipped);
      every source/target combination is marked with 1 in the matrix.
    - ``</bead>``: ends the block.

    The file is read inside a context manager so the handle is closed
    even when reading fails.

    :type filename: str
    :param filename: a file with trees, word alignments, and subtree
        alignments for multiple sentence pairs.
    :return: one ``cls`` instance per block that was not discarded.
    """
    import codecs
    with codecs.open(filename, 'r', 'utf-8') as f:
        content = f.read()
    blocks = util.cleanData(content).split('\n\n')

    beadList = []
    srcTree = None
    tgtTree = None
    wordAlignment = None
    subtreeAlignment = []
    for block in blocks[:-1]:
        block = block.split('\n')
        i = 0
        errFlag = False
        while i < len(block):
            line = block[i]
            if line.startswith('SOURCE'):
                if line[8:].startswith('ERROR'):
                    errFlag = True
                    break
                srcTree = nltk.ParentedTree(line[8:])
                if srcTree.leaves() == []:
                    errFlag = True
                    break
                i += 1
                continue
            elif line.startswith('TARGET'):
                tgtTree = nltk.ParentedTree(line[8:])
                if tgtTree.leaves() == []:
                    errFlag = True
                    break
                # One row per source word, one column per target word.
                wordAlignment = [[0 for j in xrange(len(tgtTree.leaves()))]
                                 for k in xrange(len(srcTree.leaves()))]
                i += 1
                continue
            elif line.startswith('<mapping>'):
                i += 1
                line = block[i]
                while not line.startswith('</mapping>'):
                    fields = line.split()
                    srcSpan = fields[0].split(',')
                    tgtSpan = fields[1].split(',')
                    x1 = int(srcSpan[0]) - 1
                    x2 = int(srcSpan[-1])
                    y1 = int(tgtSpan[0]) - 1
                    y2 = int(tgtSpan[-1])
                    subtreeAlignment.append((x1, y1, x2, y2))
                    i += 1
                    line = block[i]
                i += 1
                continue
            elif line.startswith('<alignment>'):
                i += 1
                line = block[i]
                while not line.startswith('</alignment>'):
                    fields = line.split()
                    srcIndexes = [int(item) - 1 for item in fields[0].split(',') if int(item) != -1]
                    tgtIndexes = [int(item) - 1 for item in fields[1].split(',') if int(item) != -1]
                    for srcIndex in srcIndexes:
                        for tgtIndex in tgtIndexes:
                            wordAlignment[srcIndex][tgtIndex] = 1
                    i += 1
                    line = block[i]
                i += 1
                continue
            elif line.startswith('</bead>'):
                break
            # Any other line is skipped.
            i += 1
        if not errFlag:
            beadList.append(
                cls(srcTree, tgtTree, wordAlignment, subtreeAlignment, False, False, False, False, False))
        # Start the next block from a clean state.
        srcTree, tgtTree, wordAlignment, subtreeAlignment = None, None, None, []
    return beadList
def test_get_head_nml(self):
    """Check that ``get_head`` returns the last noun ``(NN curtain)`` for an ``NML`` parse."""
    expected = nltk.ParentedTree("NN", ["curtain"])
    tree = nltk.ParentedTree.fromstring("(NML (NN air) (NN curtain))")
    self.assertEqual(expected, self.head_finder.get_head(tree))