def test1_EasySRL_BoyGirl2(self):
    # Derivation for "The boy will want to believe the girl".
    # Checks three outputs built from it: the linear DRS, the list of
    # constituents, and the constituent string of the verbnet sentence.
    # A '#' inside an expected constituent precedes its head word.
    txt = r'''(<T S[dcl] 1 2> (<T NP 0 2> (<L NP/N DT DT The NP/N>) (<L N NN NN boy N>) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[b]\NP) MD MD will (S[dcl]\NP)/(S[b]\NP)>) (<T S[b]\NP 0 2> (<L (S[b]\NP)/(S[to]\NP) VB VB want (S[b]\NP)/(S[to]\NP)>) (<T S[to]\NP 0 2> (<L (S[to]\NP)/(S[b]\NP) TO TO to (S[to]\NP)/(S[b]\NP)>) (<T S[b]\NP 0 2> (<L (S[b]\NP)/NP VB VB believe (S[b]\NP)/NP>) (<T NP 0 2> (<L NP/N DT DT the NP/N>) (<L N NN NN girl N>) ) ) ) ) ) )'''
    expected_drs = '[X1,E2,E3,X4| boy(X1),will(E2),_MODAL(E2),want(E2),_EVENT(E2),_ARG0(E2,X1),_ARG1(E2,E3),believe(E3),_EVENT(E3),_ARG0(E3,X1),_ARG1(E3,X4),girl(X4)]'
    expected_constituents = [
        'S(The boy #will want to believe the girl)',
        'NP(#The boy)',
        'S_INF(#want to believe the girl)',
        'S_INF(#to believe the girl)',
        'S_INF(#believe the girl)',
        'NP(#the girl)'
    ]

    pt = parse_ccg_derivation(txt)
    self.assertIsNotNone(pt)
    dprint(sentence_from_pt(pt))

    # Build the DRS with signature verification on, VerbNet and wiki
    # search off (per the option flags).
    ccg = Ccg2Drs(CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    ccg.build_execution_sequence(pt)
    ccg.create_drs()
    ccg.resolve_proper_names()
    ccg.final_rename()

    actual_drs = ccg.get_drs().show(SHOW_LINEAR)
    dprint(actual_drs)
    self.assertEqual(expected_drs, actual_drs)

    actual_constituents = get_constituents_string_list(ccg)
    dprint('\n'.join(actual_constituents))
    self.assertListEqual(expected_constituents, actual_constituents)

    vn_string = get_constituent_string(ccg.get_verbnet_sentence())
    self.assertEqual('NP(#The boy) VP(#will want) S_INF(#to believe) NP(#the girl)', vn_string)
def test8_Wsj0004_3(self):
    # Derivation for "Compound yields assume reinvestment of dividends and
    # that the current yield continues for a year."
    # Checks that conj + S[em] -> NP[conj] resolves to RL_TC_ATOM and that
    # the execution queue built from the derivation matches op for op.
    txt = r''' (<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 1> (<T N 1 2> (<L N/N NN NN Compound N_309/N_309>) (<L N NNS NNS yields N>) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/NP VBP VBP assume (S[dcl]\NP_236)/NP_237>) (<T NP 0 2> (<T NP 0 2> (<T NP 0 1> (<L N NN NN reinvestment N>) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_248\NP_248)/NP_249>) (<T NP 0 1> (<L N NNS NNS dividends N>) ) ) ) (<T NP[conj] 1 2> (<L conj CC CC and conj>) (<T S[em] 0 2> (<L S[em]/S[dcl] IN IN that S[em]/S[dcl]_257>) (<T S[dcl] 1 2> (<T NP 1 2> (<L NP[nb]/N DT DT the NP[nb]_297/N_297>) (<T N 1 2> (<L N/N JJ JJ current N_292/N_292>) (<L N NN NN yield N>) ) ) (<T S[dcl]\NP 0 2> (<L S[dcl]\NP VBZ VBZ continues S[dcl]\NP_262>) (<T (S\NP)\(S\NP) 0 2> (<L ((S\NP)\(S\NP))/NP IN IN for ((S_275\NP_270)_275\(S_275\NP_270)_275)/NP_276>) (<T NP 1 2> (<L NP[nb]/N DT DT a NP[nb]_283/N_283>) (<L N NN NN year N>) ) ) ) ) ) ) ) ) ) (<L . . . . .>) ) '''
    expected = [
        '<PushOp>:(compound, N/N, NN)',
        '<PushOp>:(yields, N, NNS)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(1, LP NP)',
        '<PushOp>:(assume, (S[dcl]\\NP)/NP, VBP)',
        '<PushOp>:(reinvestment, N, NN)',
        '<ExecOp>:(1, LP NP)',
        '<PushOp>:(of, (NP\\NP)/NP, IN)',
        '<PushOp>:(dividends, N, NNS)',
        '<ExecOp>:(1, LP NP)',
        '<ExecOp>:(2, FA NP\\NP)',
        '<ExecOp>:(2, BA NP)',
        '<PushOp>:(and, conj, CC)',
        '<PushOp>:(that, S[em]/S[dcl], IN)',
        '<PushOp>:(the, NP[nb]/N, DT)',
        '<PushOp>:(current, N/N, JJ)',
        '<PushOp>:(yield, N, NN)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(2, FA NP)',
        '<PushOp>:(continue, S[dcl]\\NP, VBZ)',
        '<PushOp>:(for, ((S\\NP)\\(S\\NP))/NP, IN)',
        '<PushOp>:(a, NP[nb]/N, DT)',
        '<PushOp>:(year, N, NN)',
        '<ExecOp>:(2, FA NP)',
        '<ExecOp>:(2, FA (S\\NP)\\(S\\NP))',
        '<ExecOp>:(2, BA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<ExecOp>:(2, FA S[em])',
        '<ExecOp>:(2, ATOM_TC NP[conj])',
        '<ExecOp>:(2, RCONJ NP)',
        '<ExecOp>:(2, FA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<PushOp>:(., ., .)',
        '<ExecOp>:(2, LP S[dcl])',
    ]

    pt = parse_ccg_derivation(txt)
    ccg = Ccg2Drs()

    # Rule lookup for the "and that ..." node of the derivation.
    left_cat = Category.from_cache('conj')
    right_cat = Category.from_cache('S[em]')
    result_cat = Category.from_cache('NP[conj]')
    self.assertEqual(get_rule(left_cat, right_cat, result_cat), RL_TC_ATOM)

    ccg.build_execution_sequence(pt)

    # Compare the queue through the repr of each op.
    actual = list(map(repr, ccg.exeque))
    self.assertListEqual(expected, actual)
def test2_GOLD_Wsj0051_13(self):
    # Derivation for "The bids, he added, were contrary to common sense."
    # Checks the constituent list of the verbnet sentence built from it.
    # A '#' inside an expected constituent precedes its head word.
    txt = r''' (<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 1 2> (<L NP[nb]/N DT DT The NP[nb]_273/N_273>) (<L N NNS NNS bids N>) ) (<T S[dcl]\NP 1 2> (<T (S\NP)/(S\NP) 1 2> (<L , , , , ,>) (<T (S\NP)/(S\NP) 0 2> (<T S[dcl]/S[dcl] 1 2> (<T S/(S\NP) 0 1> (<L NP PRP PRP he NP>) ) (<L (S[dcl]\NP)/S[dcl] VBD VBD added (S[dcl]\NP_242)/S[dcl]_243>) ) (<L , , , , ,>) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[adj]\NP) VBD VBD were (S[dcl]\NP_211)/(S[adj]_212\NP_211:B)_212>) (<T S[adj]\NP 0 2> (<L (S[adj]\NP)/PP JJ JJ contrary (S[adj]\NP_219)/PP_220>) (<T PP 0 2> (<L PP/NP TO TO to PP/NP_225>) (<T NP 0 1> (<T N 1 2> (<L N/N JJ JJ common N_234/N_234>) (<L N NN NN sense N>) ) ) ) ) ) ) ) (<L . . . . .>) ) '''
    pt = parse_ccg_derivation(txt)
    # Check the parse result before it is used, so a failed parse shows up
    # as this assertion rather than as an error inside sentence_from_pt.
    self.assertIsNotNone(pt)
    s = sentence_from_pt(pt)
    dprint(s)

    ccg = Ccg2Drs(CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    ccg.build_execution_sequence(pt)
    ccg.create_drs()
    ccg.final_rename()
    d = ccg.get_drs()
    s = d.show(SHOW_LINEAR)
    dprint(s)

    sent = ccg.get_verbnet_sentence()
    a = get_constituents_string_list(sent)
    x = [
        'NP(The #bids)',
        'ADVP(he #added)',
        'VP(#were)',
        'ADJP(#contrary to common sense)',
        'PP(#to)',
        'NP(common #sense)'
    ]
    dprint('\n'.join(a))
    self.assertListEqual(x, a)
def test2_GOLD_Wsj0003_1(self):
    # A form of asbestos once used to make Kent cigarette filters has caused a high percentage of cancer deaths
    # among a group of workers exposed to it more than 30 years ago, researchers reported.
    # ID=wsj_0003.1 PARSER=GOLD NUMPARSE=1
    #
    # Checks the constituent list and the constituent tree of the verbnet
    # sentence built from the derivation in txt.
    txt = r'''(<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T S[dcl] 1 2> (<T NP 0 2> (<T NP 0 2> (<T NP 1 2> (<L NP[nb]/N DT DT A NP[nb]_166/N_166>) (<L N NN NN form N>) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_174\NP_174)/NP_175>) (<T NP 0 1> (<L N NN NN asbestos N>) ) ) ) (<T NP\NP 0 1> (<T S[pss]\NP 1 2> (<L (S\NP)/(S\NP) RB RB once (S_235\NP_230)_235/(S_235\NP_230)_235>) (<T S[pss]\NP 0 2> (<L (S[pss]\NP)/(S[to]\NP) VBN VBN used (S[pss]\NP_187)/(S[to]_188\NP_187:B)_188>) (<T S[to]\NP 0 2> (<L (S[to]\NP)/(S[b]\NP) TO TO to (S[to]\NP_197)/(S[b]_198\NP_197:B)_198>) (<T S[b]\NP 0 2> (<L (S[b]\NP)/NP VB VB make (S[b]\NP_205)/NP_206>) (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Kent N_222/N_222>) (<T N 1 2> (<L N/N NN NN cigarette N_215/N_215>) (<L N NNS NNS filters N>) ) ) ) ) ) ) ) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[pt]\NP) VBZ VBZ has (S[dcl]\NP_23)/(S[pt]_24\NP_23:B)_24>) (<T S[pt]\NP 0 2> (<L (S[pt]\NP)/NP VBN VBN caused (S[pt]\NP_31)/NP_32>) (<T NP 0 2> (<T NP 0 2> (<T NP 1 2> (<L NP[nb]/N DT DT a NP[nb]_46/N_46>) (<T N 1 2> (<L N/N JJ JJ high N_41/N_41>) (<L N NN NN percentage N>) ) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_54\NP_54)/NP_55>) (<T NP 0 1> (<T N 1 2> (<L N/N NN NN cancer N_64/N_64>) (<L N NNS NNS deaths N>) ) ) ) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN among (NP_73\NP_73)/NP_74>) (<T NP 0 2> (<T NP 1 2> (<L NP[nb]/N DT DT a NP[nb]_81/N_81>) (<L N NN NN group N>) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_89\NP_89)/NP_90>) (<T NP 0 2> (<T NP 0 1> (<L N NNS NNS workers N>) ) (<T NP\NP 0 1> (<T S[pss]\NP 0 2> (<T S[pss]\NP 0 2> (<L (S[pss]\NP)/PP VBN VBN exposed (S[pss]\NP_100)/PP_101>) (<T PP 0 2> (<L PP/NP TO TO to PP/NP_106>) (<L NP PRP PRP it NP>) ) ) (<T (S\NP)\(S\NP) 1 2> (<T NP 0 1> (<T N 1 2> (<T N/N 1 2> (<T (N/N)/(N/N) 1 2> (<L S[adj]\NP RBR RBR more S[adj]\NP_153>) (<L ((N/N)/(N/N))\(S[adj]\NP) IN IN than ((N_147/N_139)_147/(N_147/N_139)_147)\(S[adj]_148\NP_142)_148>) ) (<L N/N CD CD 30 N_131/N_131>) ) (<L N NNS NNS years N>) ) ) (<L ((S\NP)\(S\NP))\NP IN IN ago ((S_121\NP_116)_121\(S_121\NP_116)_121)\NP_122>) ) ) ) ) ) ) ) ) ) ) ) (<T S[dcl]\S[dcl] 1 2> (<L , , , , ,>) (<T S[dcl]\S[dcl] 1 2> (<T NP 0 1> (<L N NNS NNS researchers N>) ) (<L (S[dcl]\S[dcl])\NP VBD VBD reported (S[dcl]\S[dcl]_8)\NP_9>) ) ) ) (<L . . . . .>) )'''
    pt = parse_ccg_derivation(txt)
    # Check the parse result before it is used, so a failed parse shows up
    # as this assertion rather than as an error inside sentence_from_pt.
    self.assertIsNotNone(pt)
    s = sentence_from_pt(pt)
    dprint(s)

    ccg = Ccg2Drs(CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    ccg.build_execution_sequence(pt)
    ccg.create_drs()
    ccg.final_rename()
    d = ccg.get_drs()
    s = d.show(SHOW_LINEAR)
    dprint(s)

    sent = ccg.get_verbnet_sentence()
    a = get_constituents_string_list(sent)
    # Hash indicates head word in constituent; trailing comment is the
    # list index used by the constituent tree below.
    x = [
        'NP(A #form)',                                      # 0
        'PP(#of)',                                          # 1
        'NP(#asbestos)',                                    # 2
        'ADVP(once #used to make Kent cigarette filters)',  # 3
        'S_INF(#to make)',                                  # 4
        'NP(Kent cigarette #filters)',                      # 5
        'VP(#has caused)',                                  # 6
        'NP(a high #percentage)',                           # 7
        'PP(#of)',                                          # 8
        'NP(cancer #deaths)',                               # 9
        'PP(#among)',                                       # 10
        'NP(a #group)',                                     # 11
        'PP(#of)',                                          # 12
        'NP(#workers)',                                     # 13
        'ADVP(#exposed to it more than 30 years ago)',      # 14
        'NP(more than 30 #years)',                          # 15
        'NP(#researchers)',                                 # 16
        'VP(#reported)',                                    # 17
    ]
    dprint('\n'.join(a))
    self.assertListEqual(x, a)

    # Expected tree as (index, [children]); indentation mirrors the tuple.
    # 17 VP(reported.)
    #   06 VP(has caused)
    #     00 NP(A form)
    #       01 PP(of)
    #         02 NP(asbestos)
    #       03 ADVP(once used to make Kent cigarette filters)
    #         04 S_INF(to make)
    #           05 NP(Kent cigarette filters)
    #     07 NP(a high percentage)
    #       08 PP(of)
    #         09 NP(cancer deaths)
    #       10 PP(among)
    #         11 NP(a group)
    #           12 PP(of)
    #             13 NP(workers)
    #               14 ADVP(exposed to it more than 30 years ago)
    #                 15 NP(more than 30 years)
    #   16 NP(researchers)
    x = (17, [(6, [(0, [(1, [(2, [])]), (3, [(4, [(5, [])])])]), (7, [(8, [(9, [])]), (10, [(11, [(12, [(13, [(14, [(15, [])])])])])])])]), (16, [])])
    a = sent.get_constituent_tree()
    dprint_constituent_tree(sent, a)
    self.assertEqual(repr(x), repr(a))
def test2_GOLD_Wsj0001_2(self):
    # Mr. Vinken is chairman of Elsevier N.V. , the Dutch publishing group .
    #
    # PARG
    # 1 0 N/N 1 Vinken Mr.
    # 1 2 (S[dcl]\NP)/NP 1 Vinken is
    # 3 2 (S[dcl]\NP)/NP 2 chairman is
    # 3 4 (NP\NP)/NP 1 chairman of
    # 6 4 (NP\NP)/NP 2 N.V. of
    # 6 5 N/N 1 N.V. Elsevier
    # 11 4 (NP\NP)/NP 2 group of
    # 11 8 NP[nb]/N 1 group the
    # 11 9 N/N 1 group Dutch
    # 11 10 N/N 1 group publishing
    #
    # Checks the constituent list and the constituent tree of the verbnet
    # sentence built from the derivation in txt.
    txt = r''' (<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Mr. N_142/N_142>) (<L N NNP NNP Vinken N>) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/NP VBZ VBZ is (S[dcl]\NP_87)/NP_88>) (<T NP 0 2> (<T NP 0 1> (<L N NN NN chairman N>) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_99\NP_99)/NP_100>) (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Elsevier N_109/N_109>) (<L N NNP NNP N.V. N>) ) ) (<T NP[conj] 1 2> (<L , , , , ,>) (<T NP 1 2> (<L NP[nb]/N DT DT the NP[nb]_131/N_131>) (<T N 1 2> (<L N/N NNP NNP Dutch N_126/N_126>) (<T N 1 2> (<L N/N VBG VBG publishing N_119/N_119>) (<L N NN NN group N>) ) ) ) ) ) ) ) ) ) (<L . . . . .>) )'''
    pt = parse_ccg_derivation(txt)
    # Check the parse result before it is used, so a failed parse shows up
    # as this assertion rather than as an error inside sentence_from_pt.
    self.assertIsNotNone(pt)
    s = sentence_from_pt(pt)
    dprint(s)

    ccg = Ccg2Drs(CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    ccg.build_execution_sequence(pt)
    ccg.create_drs()
    ccg.resolve_proper_names()
    ccg.final_rename()
    d = ccg.get_drs()
    s = d.show(SHOW_LINEAR)
    dprint(s)

    sent = ccg.get_verbnet_sentence()
    a = get_constituents_string_list(sent)
    # Hash indicates head word in constituent.
    x = [
        'NP(#Mr.-Vinken)',
        'VP(#is)',
        'NP(#chairman)',
        'PP(#of)',
        'NP(#Elsevier-N.V.)',
        'NP(the Dutch publishing #group)',
    ]
    dprint('\n'.join(a))
    self.assertListEqual(x, a)

    # Expected tree as (index, [children]); indices refer to the list above.
    # 01 VP(is)
    #   00 NP(Mr.-Vinken)
    #   02 NP(chairman)
    #     03 PP(of)
    #       04 NP(Elsevier-N.V.)
    #         05 NP(the Dutch publishing group)
    x = (1, [(0, []), (2, [(3, [(4, [(5, [])])])])])
    a = sent.get_constituent_tree()
    dprint_constituent_tree(sent, a)
    self.assertEqual(repr(x), repr(a))
def test2_GOLD_Wsj0001_1(self):
    # ID=wsj_0001.1 PARSER=GOLD NUMPARSE=1
    # Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov 29.
    #
    # Checks the constituent list and the constituent tree of the verbnet
    # sentence built from the derivation in txt.
    txt = r'''(<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 2> (<T NP 0 2> (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Pierre N_73/N_73>) (<L N NNP NNP Vinken N>) ) ) (<L , , , , ,>) ) (<T NP\NP 0 1> (<T S[adj]\NP 1 2> (<T NP 0 1> (<T N 1 2> (<L N/N CD CD 61 N_93/N_93>) (<L N NNS NNS years N>) ) ) (<L (S[adj]\NP)\NP JJ JJ old (S[adj]\NP_83)\NP_84>) ) ) ) (<L , , , , ,>) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[b]\NP) MD MD will (S[dcl]\NP_10)/(S[b]_11\NP_10:B)_11>) (<T S[b]\NP 0 2> (<T S[b]\NP 0 2> (<T (S[b]\NP)/PP 0 2> (<L ((S[b]\NP)/PP)/NP VB VB join ((S[b]\NP_20)/PP_21)/NP_22>) (<T NP 1 2> (<L NP[nb]/N DT DT the NP[nb]_29/N_29>) (<L N NN NN board N>) ) ) (<T PP 0 2> (<L PP/NP IN IN as PP/NP_34>) (<T NP 1 2> (<L NP[nb]/N DT DT a NP[nb]_48/N_48>) (<T N 1 2> (<L N/N JJ JJ nonexecutive N_43/N_43>) (<L N NN NN director N>) ) ) ) ) (<T (S\NP)\(S\NP) 0 2> (<L ((S\NP)\(S\NP))/N[num] NNP NNP Nov. ((S_61\NP_56)_61\(S_61\NP_56)_61)/N[num]_62>) (<L N[num] CD CD 29 N[num]>) ) ) ) ) (<L . . . . .>) )'''
    # FIXME: VP(will #join) should be S_INF(will #join).
    # The issue occurs because modal-verb combinator categories are converted to modifiers. Must be fixed on
    # functor creation - Lexeme.get_production()
    #   will: (S[dcl]\NP)/(S[b]/NP) -> (S\NP)/(S/NP)
    # Hash indicates head word in constituent.
    expected_constituents = [
        'NP(#Pierre-Vinken)',
        'ADJP(61 years #old)',
        'NP(61 #years)',
        'VP(#will join)',
        'NP(the #board)',
        'PP(#as)',
        'NP(a nonexecutive #director)',
        'NP(#Nov. 29)'
    ]
    # Expected tree as (index, [children]); indices refer to the list above.
    # 03 VP(will join)
    #   00 NP(Pierre-Vinken)
    #     01 ADJP(61 years old)
    #       02 NP(61 years)
    #   04 NP(the board)
    #   05 PP(as)
    #     06 NP(a nonexecutive director)
    #   07 NP(Nov. 29)
    expected_tree = (3, [(0, [(1, [(2, [])])]), (4, []), (5, [(6, [])]), (7, [])])

    pt = parse_ccg_derivation(txt)
    self.assertIsNotNone(pt)
    dprint(sentence_from_pt(pt))

    ccg = Ccg2Drs(CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    ccg.build_execution_sequence(pt)
    ccg.create_drs()
    ccg.resolve_proper_names()
    ccg.final_rename()
    dprint(ccg.get_drs().show(SHOW_LINEAR))

    sent = ccg.get_verbnet_sentence()
    actual_constituents = get_constituents_string_list(sent)
    dprint('\n'.join(actual_constituents))
    self.assertListEqual(expected_constituents, actual_constituents)

    actual_tree = sent.get_constituent_tree()
    dprint_constituent_tree(sent, actual_tree)
    self.assertEqual(repr(expected_tree), repr(actual_tree))
def test2_GOLD_Wsj0002_1(self):
    # ID=wsj_0002.1 PARSER=GOLD NUMPARSE=1
    # Rudolph Agnew, 55 years old and former chairman of Consolidated Gold Fields PLC, was named a nonexecutive
    # director of this British industrial conglomerate.
    #
    # Checks the constituent list and the constituent tree of the verbnet
    # sentence built from the derivation in txt.
    txt = r'''(<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 2> (<T NP 0 2> (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Rudolph N_72/N_72>) (<L N NNP NNP Agnew N>) ) ) (<L , , , , ,>) ) (<T NP\NP 0 1> (<T S[adj]\NP 0 2> (<T S[adj]\NP 1 2> (<T NP 0 1> (<T N 1 2> (<L N/N CD CD 55 N_92/N_92>) (<L N NNS NNS years N>) ) ) (<L (S[adj]\NP)\NP JJ JJ old (S[adj]\NP_82)\NP_83>) ) (<T S[adj]\NP[conj] 1 2> (<L conj CC CC and conj>) (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N JJ JJ former N_102/N_102>) (<L N NN NN chairman N>) ) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_111\NP_111)/NP_112>) (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Consolidated N_135/N_135>) (<T N 1 2> (<L N/N NNP NNP Gold N_128/N_128>) (<T N 1 2> (<L N/N NNP NNP Fields N_121/N_121>) (<L N NNP NNP PLC N>) ) ) ) ) ) ) ) ) ) ) (<L , , , , ,>) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[pss]\NP) VBD VBD was (S[dcl]\NP_10)/(S[pss]_11\NP_10:B)_11>) (<T S[pss]\NP 0 2> (<L (S[pss]\NP)/NP VBN VBN named (S[pss]\NP_18)/NP_19>) (<T NP 0 2> (<T NP 1 2> (<L NP[nb]/N DT DT a NP[nb]_33/N_33>) (<T N 1 2> (<L N/N JJ JJ nonexecutive N_28/N_28>) (<L N NN NN director N>) ) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_41\NP_41)/NP_42>) (<T NP 1 2> (<L NP[nb]/N DT DT this NP[nb]_63/N_63>) (<T N 1 2> (<L N/N JJ JJ British N_58/N_58>) (<T N 1 2> (<L N/N JJ JJ industrial N_51/N_51>) (<L N NN NN conglomerate N>) ) ) ) ) ) ) ) ) (<L . . . . .>) )'''
    # Hash indicates head word in constituent
    expected_constituents = [
        'NP(#Rudolph-Agnew)',
        'ADJP(55 years #old and former chairman of Consolidated-Gold-Fields-PLC)',
        'NP(55 #years)',
        'NP(former #chairman)',
        'PP(#of)',
        'NP(#Consolidated-Gold-Fields-PLC)',
        'VP(#was named)',
        'NP(a nonexecutive #director)',
        'PP(#of)',
        'NP(this British industrial #conglomerate)'
    ]
    # Expected tree as (index, [children]); indices refer to the list above.
    # 6 VP(was named)
    #   0 NP(Rudolph-Agnew)
    #     1 ADJP(55 years old and former chairman of Consolidated-Gold-Fields-PLC)
    #       2 NP(55 years)
    #       3 NP(former chairman)
    #         4 PP(of)
    #           5 NP(Consolidated-Gold-Fields-PLC)
    #   7 NP(a nonexecutive director)
    #     8 PP(of)
    #       9 NP(this British industrial conglomerate)
    expected_tree = (6, [(0, [(1, [(2, []), (3, [(4, [(5, [])])])])]), (7, [(8, [(9, [])])])])

    pt = parse_ccg_derivation(txt)
    self.assertIsNotNone(pt)
    dprint(sentence_from_pt(pt))

    ccg = Ccg2Drs(CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    ccg.build_execution_sequence(pt)
    ccg.create_drs()
    ccg.resolve_proper_names()
    ccg.final_rename()
    dprint(ccg.get_drs().show(SHOW_LINEAR))

    sent = ccg.get_verbnet_sentence()
    actual_constituents = get_constituents_string_list(sent)
    dprint('\n'.join(actual_constituents))
    self.assertListEqual(expected_constituents, actual_constituents)

    actual_tree = sent.get_constituent_tree()
    dprint_constituent_tree(sent, actual_tree)
    self.assertEqual(repr(expected_tree), repr(actual_tree))
def test8_RuleUniquenessEasySRL(self):
    # Samples every 50th line of each EasySRL 'ccg_derivation' file under
    # <projdir>/data/ldc/easysrl/ccgbank and checks that every ExecOp in the
    # resulting execution queue resolves to exactly one rule via get_rule().
    # Lines that fail to parse or build are counted and skipped.

    # The project root is seven directory levels above this file.
    projdir = __file__
    for _ in range(7):
        projdir = os.path.dirname(projdir)
    ldcpath = os.path.join(projdir, 'data', 'ldc', 'easysrl', 'ccgbank')

    allfiles = []
    for fname in os.listdir(ldcpath):
        if 'ccg_derivation' not in fname:
            continue
        ldcpath1 = os.path.join(ldcpath, fname)
        if os.path.isfile(ldcpath1):
            allfiles.append(ldcpath1)

    failed_parse = 0
    # Entries are (file name, line index, description, rules) so that they
    # match the four placeholders of the report format below.
    ambiguous = []
    start = 0
    for fn in allfiles:
        with open(fn, 'r') as fd:
            lines = fd.readlines()

        name, _ = os.path.splitext(os.path.basename(fn))
        for i in range(start, len(lines), 50):
            # NOTE(review): presumably `start` is a manual resume offset for
            # the first file only; kept as in the original.
            start = 0
            ccgbank = lines[i]
            dprint('%s-%04d' % (name, i))
            ccg = Ccg2Drs()
            try:
                pt = parse_ccg_derivation(ccgbank)
                ccg.build_execution_sequence(pt)
            except Exception:
                # Best effort: unparsable samples are counted, not fatal.
                failed_parse += 1
                continue

            self.assertIsNotNone(pt)
            for op in ccg.exeque:
                if isinstance(op, PushOp):
                    continue
                self.assertIsInstance(op, ExecOp)
                left = op.sub_ops[0].category
                result = op.category
                if len(op.sub_ops) == 2:
                    right = op.sub_ops[1].category
                else:
                    right = CAT_EMPTY

                exclude = []
                # Should not have ambiguity
                rule = get_rule(left, right, result, exclude)
                if rule is None and right != CAT_EMPTY:
                    # Retry with features stripped from all three categories.
                    rule = get_rule(left.remove_features(), right.remove_features(), result.remove_features(),
                                    exclude)
                self.assertIsNotNone(rule)

                # Keep asking until no further rule is returned.
                # NOTE(review): this terminates only if get_rule() records
                # returned rules in `exclude` and skips them - confirm.
                rstr = ''
                while rule is not None:
                    rstr += repr(rule) + '|'
                    rule = get_rule(left, right, result, exclude)
                if len(exclude) > 1:
                    ambiguous.append(
                        (name, i, '%s <- %s <{%s}> %s' % (result, left, rstr, right), exclude))

    dprint('failed to parse %d derivations' % failed_parse)
    for entry in ambiguous:
        dprint('ambiguous rule in %s-%04d: %s {%s}' % entry)
    self.assertEqual(0, len(ambiguous))
def test6_Wsj0051_13(self):
    # Derivation for "The bids, he added, were contrary to common sense."
    # Checks that the execution queue built from the derivation matches the
    # expected sequence of push/exec operations, compared by repr.
    txt = r''' (<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 1 2> (<L NP[nb]/N DT DT The NP[nb]_273/N_273>) (<L N NNS NNS bids N>) ) (<T S[dcl]\NP 1 2> (<T (S\NP)/(S\NP) 1 2> (<L , , , , ,>) (<T (S\NP)/(S\NP) 0 2> (<T S[dcl]/S[dcl] 1 2> (<T S/(S\NP) 0 1> (<L NP PRP PRP he NP>) ) (<L (S[dcl]\NP)/S[dcl] VBD VBD added (S[dcl]\NP_242)/S[dcl]_243>) ) (<L , , , , ,>) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[adj]\NP) VBD VBD were (S[dcl]\NP_211)/(S[adj]_212\NP_211:B)_212>) (<T S[adj]\NP 0 2> (<L (S[adj]\NP)/PP JJ JJ contrary (S[adj]\NP_219)/PP_220>) (<T PP 0 2> (<L PP/NP TO TO to PP/NP_225>) (<T NP 0 1> (<T N 1 2> (<L N/N JJ JJ common N_234/N_234>) (<L N NN NN sense N>) ) ) ) ) ) ) ) (<L . . . . .>) ) '''
    expected = [
        '<PushOp>:(the, NP[nb]/N, DT)',
        '<PushOp>:(bids, N, NNS)',
        '<ExecOp>:(2, FA NP)',
        '<PushOp>:(,, ,, ,)',
        '<PushOp>:(he, NP, PRP)',
        '<ExecOp>:(1, TR S/(S\\NP))',
        '<PushOp>:(add, (S[dcl]\\NP)/S[dcl], VBD)',
        '<ExecOp>:(2, FC S[dcl]/S[dcl])',
        '<PushOp>:(,, ,, ,)',
        '<ExecOp>:(2, L_UNARY_TC (S\\NP)/(S\\NP))',
        '<ExecOp>:(2, RP (S\\NP)/(S\\NP))',
        '<PushOp>:(be, (S[dcl]\\NP)/(S[adj]\\NP), VBD)',
        '<PushOp>:(contrary, (S[adj]\\NP)/PP, JJ)',
        '<PushOp>:(to, PP/NP, TO)',
        '<PushOp>:(common, N/N, JJ)',
        '<PushOp>:(sense, N, NN)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(1, LP NP)',
        '<ExecOp>:(2, FA PP)',
        '<ExecOp>:(2, FA S[adj]\\NP)',
        '<ExecOp>:(2, FA S[dcl]\\NP)',
        '<ExecOp>:(2, FA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<PushOp>:(., ., .)',
        '<ExecOp>:(2, LP S[dcl])',
    ]

    pt = parse_ccg_derivation(txt)
    ccg = Ccg2Drs()
    ccg.build_execution_sequence(pt)

    # Check execution queue
    actual = list(map(repr, ccg.exeque))
    self.assertListEqual(expected, actual)
def test5_EasySRL_04_1850(self):
    # EasySRL derivation for "The investment community, however, strongly
    # believes that the strike will be settled before there is any lasting
    # effect on either Boeing or its work force."
    # Checks that the execution queue built from the derivation matches the
    # expected sequence of push/exec operations, compared by repr.
    txt = r''' (<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 2> (<L NP/N DT DT The NP/N>) (<T N 1 2> (<L N/N NN NN investment N/N>) (<T N 0 2> (<L N NN NN community N>) (<L , , , , ,>) ) ) ) (<T S[dcl]\NP 1 2> (<T (S\NP)/(S\NP) 0 2> (<L (S\NP)/(S\NP) RB RB however (S\NP)/(S\NP)>) (<T ((S\NP)/(S\NP))\((S\NP)/(S\NP)) 1 2> (<L , , , , ,>) (<L (S\NP)/(S\NP) RB RB strongly (S\NP)/(S\NP)>) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/S[em] VBZ VBZ believes (S[dcl]\NP)/S[em]>) (<T S[em] 0 2> (<L S[em]/S[dcl] IN IN that S[em]/S[dcl]>) (<T S[dcl] 1 2> (<T NP 0 2> (<L NP/N DT DT the NP/N>) (<L N NN NN strike N>) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[b]\NP) MD MD will (S[dcl]\NP)/(S[b]\NP)>) (<T S[b]\NP 0 2> (<T S[b]\NP 0 2> (<L (S[b]\NP)/(S[pss]\NP) VB VB be (S[b]\NP)/(S[pss]\NP)>) (<L S[pss]\NP VBN VBN settled S[pss]\NP>) ) (<T (S\NP)\(S\NP) 0 2> (<L ((S\NP)\(S\NP))/S[dcl] IN IN before ((S\NP)\(S\NP))/S[dcl]>) (<T S[dcl] 1 2> (<L NP[thr] EX EX there NP[thr]>) (<T S[dcl]\NP[thr] 0 2> (<L (S[dcl]\NP[thr])/NP VBZ VBZ is (S[dcl]\NP[thr])/NP>) (<T NP 0 2> (<T NP 0 2> (<L NP/N DT DT any NP/N>) (<T N 1 2> (<L N/N JJ JJ lasting N/N>) (<T N 0 2> (<L N/PP NN NN effect N/PP>) (<T PP 0 2> (<L PP/NP IN IN on PP/NP>) (<T NP 1 2> (<L NP/NP CC CC either NP/NP>) (<T NP 0 1> (<L N NNP NNP Boeing N>) ) ) ) ) ) ) (<T NP\NP 1 2> (<L conj CC CC or conj>) (<T NP 0 2> (<L NP/(N/PP) PRP$ PRP$ its NP/(N/PP)>) (<T N/PP 1 2> (<L N/N NN NN work N/N>) (<L N/PP NN NN force N/PP>) ) ) ) ) ) ) ) ) ) ) ) ) ) ) (<L . . . . .>) )'''
    pt = parse_ccg_derivation(txt)
    ccg = Ccg2Drs()
    ccg.build_execution_sequence(pt)
    # Check execution queue
    actual = [repr(x) for x in ccg.exeque]
    # Every backslash is written as '\\'. Two entries previously used the
    # invalid escape '\(' which yields the same text but triggers an
    # invalid-escape warning on newer Python versions.
    expected = [
        '<PushOp>:(the, NP/N, DT)',
        '<PushOp>:(investment, N/N, NN)',
        '<PushOp>:(community, N, NN)',
        '<PushOp>:(,, ,, ,)',
        '<ExecOp>:(2, LP N)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(2, FA NP)',
        '<PushOp>:(however, (S\\NP)/(S\\NP), RB)',
        '<PushOp>:(,, ,, ,)',
        '<PushOp>:(strongly, (S\\NP)/(S\\NP), RB)',
        '<ExecOp>:(2, R_UNARY_TC ((S\\NP)/(S\\NP))\\((S\\NP)/(S\\NP)))',
        '<ExecOp>:(2, BA (S\\NP)/(S\\NP))',
        '<PushOp>:(believe, (S[dcl]\\NP)/S[em], VBZ)',
        '<PushOp>:(that, S[em]/S[dcl], IN)',
        '<PushOp>:(the, NP/N, DT)',
        '<PushOp>:(strike, N, NN)',
        '<ExecOp>:(2, FA NP)',
        '<PushOp>:(will, (S\\NP)/(S\\NP), MD)',
        '<PushOp>:(be, (S[b]\\NP)/(S[pss]\\NP), VB)',
        '<PushOp>:(settle, S[pss]\\NP, VBN)',
        '<ExecOp>:(2, FA S[b]\\NP)',
        '<PushOp>:(before, ((S\\NP)\\(S\\NP))/S[dcl], IN)',
        '<PushOp>:(there, NP[thr], EX)',
        '<PushOp>:(be, (S[dcl]\\NP[thr])/NP, VBZ)',
        '<PushOp>:(any, NP/N, DT)',
        '<PushOp>:(lasting, N/N, JJ)',
        '<PushOp>:(effect, N/PP, NN)',
        '<PushOp>:(on, PP/NP, IN)',
        '<PushOp>:(either, NP/NP, CC)',
        '<PushOp>:(Boeing, N, NNP)',
        '<ExecOp>:(1, LP NP)',
        '<ExecOp>:(2, FA NP)',
        '<ExecOp>:(2, FA PP)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(2, FA NP)',
        '<PushOp>:(or, conj, CC)',
        '<PushOp>:(its, NP/(N/PP), PRP$)',
        '<PushOp>:(work, N/N, NN)',
        '<PushOp>:(force, N/PP, NN)',
        '<ExecOp>:(2, FC N/PP)',
        '<ExecOp>:(2, FA NP)',
        '<ExecOp>:(2, R_UNARY_TC NP\\NP)',
        '<ExecOp>:(2, BA NP)',
        '<ExecOp>:(2, FA S[dcl]\\NP[thr])',
        '<ExecOp>:(2, BA S[dcl])',
        '<ExecOp>:(2, FA (S\\NP)\\(S\\NP))',
        '<ExecOp>:(2, BA S[b]\\NP)',
        '<ExecOp>:(2, FA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<ExecOp>:(2, FA S[em])',
        '<ExecOp>:(2, FA S[dcl]\\NP)',
        '<ExecOp>:(2, FA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<PushOp>:(., ., .)',
        '<ExecOp>:(2, LP S[dcl])',
    ]
    self.assertListEqual(expected, actual)
def test4_Wsj0999_11(self):
    # Derivation for "People on fixed incomes get a break at Espre; over 55
    # wins a 45 % discount at Anaheim Imperial Health Spa."
    # Checks that the execution queue built from the derivation matches the
    # expected sequence of push/exec operations, compared by repr.
    txt = r''' (<T S[dcl] 0 2> (<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 2> (<T NP 0 1> (<L N NNS NNS People N>) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN on (NP_159\NP_159)/NP_160>) (<T NP 0 1> (<T N 1 2> (<L N/N VBN VBN fixed N_169/N_169>) (<L N NNS NNS incomes N>) ) ) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/NP VBP VBP get (S[dcl]\NP_128)/NP_129>) (<T NP 0 2> (<T NP 1 2> (<L NP[nb]/N DT DT a NP[nb]_136/N_136>) (<L N NN NN break N>) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN at (NP_144\NP_144)/NP_145>) (<T NP 0 1> (<L N NNP NNP Espre N>) ) ) ) ) ) (<T S[dcl][conj] 1 2> (<L ; ; : ; ;>) (<T S[dcl] 1 2> (<T NP 0 1> (<T N 1 2> (<L N/N IN IN over N_248/N_248>) (<L N CD CD 55 N>) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/NP VBZ NNS wins (S[dcl]\NP_177)/NP_178>) (<T NP 0 2> (<T NP 1 2> (<L NP[nb]/N DT DT a NP[nb]_206/N_206>) (<T N 1 2> (<T N/N 1 2> (<L (N/N)/(N/N) CD CD 45 (N_201/N_195)_201/(N_201/N_195)_201>) (<L N/N NN NN % N_187/N_187>) ) (<L N NN NN discount N>) ) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN at (NP_214\NP_214)/NP_215>) (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Anaheim N_238/N_238>) (<T N 1 2> (<L N/N NNP NNP Imperial N_231/N_231>) (<T N 1 2> (<L N/N NNP NNP Health N_224/N_224>) (<L N NNP NNP Spa N>) ) ) ) ) ) ) ) ) ) ) (<L . . . . .>) )'''
    expected = [
        '<PushOp>:(people, N, NNS)',
        '<ExecOp>:(1, LP NP)',
        '<PushOp>:(on, (NP\\NP)/NP, IN)',
        '<PushOp>:(fix, N/N, VBN)',
        '<PushOp>:(incomes, N, NNS)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(1, LP NP)',
        '<ExecOp>:(2, FA NP\\NP)',
        '<ExecOp>:(2, BA NP)',
        '<PushOp>:(get, (S[dcl]\\NP)/NP, VBP)',
        '<PushOp>:(a, NP[nb]/N, DT)',
        '<PushOp>:(break, N, NN)',
        '<ExecOp>:(2, FA NP)',
        '<PushOp>:(at, (NP\\NP)/NP, IN)',
        '<PushOp>:(Espre, N, NNP)',
        '<ExecOp>:(1, LP NP)',
        '<ExecOp>:(2, FA NP\\NP)',
        '<ExecOp>:(2, BA NP)',
        '<ExecOp>:(2, FA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<PushOp>:(;, ;, ;)',
        '<PushOp>:(over, N/N, IN)',
        '<PushOp>:(55, N, CD)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(1, LP NP)',
        '<PushOp>:(win, (S[dcl]\\NP)/NP, VBZ)',
        '<PushOp>:(a, NP[nb]/N, DT)',
        '<PushOp>:(45, (N/N)/(N/N), CD)',
        '<PushOp>:(%, N/N, NN)',
        '<ExecOp>:(2, FA N/N)',
        '<PushOp>:(discount, N, NN)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(2, FA NP)',
        '<PushOp>:(at, (NP\\NP)/NP, IN)',
        '<PushOp>:(Anaheim, N/N, NNP)',
        '<PushOp>:(Imperial, N/N, NNP)',
        '<PushOp>:(Health, N/N, NNP)',
        '<PushOp>:(Spa, N, NNP)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(1, LP NP)',
        '<ExecOp>:(2, FA NP\\NP)',
        '<ExecOp>:(2, BA NP)',
        '<ExecOp>:(2, FA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<ExecOp>:(2, RP S[dcl][conj])',
        '<ExecOp>:(2, RCONJ S[dcl])',
        '<PushOp>:(., ., .)',
        '<ExecOp>:(2, LP S[dcl])',
    ]

    pt = parse_ccg_derivation(txt)
    ccg = Ccg2Drs()
    ccg.build_execution_sequence(pt)

    # Check execution queue
    actual = list(map(repr, ccg.exeque))
    self.assertListEqual(expected, actual)
def test3_Wsj0002_1(self):
    """Compare the new and old derivation parsers, then check the execution queue.

    The parse tree from parse_ccg_derivation must have the same repr as the
    tree from parse_ccg_derivation_old (converted with pt_to_utf8 when
    future_string is not unicode). The execution queue built from the new
    tree is then compared, operation by operation, with the expected list.
    """
    # Rudolph Agnew, 55 years old and former chairman of Consolidated Gold Fields PLC, was named a nonexecutive
    # director of this British industrial conglomerate.
    txt = r''' (<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 2> (<T NP 0 2> (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Rudolph N_72/N_72>) (<L N NNP NNP Agnew N>) ) ) (<L , , , , ,>) ) (<T NP\NP 0 1> (<T S[adj]\NP 0 2> (<T S[adj]\NP 1 2> (<T NP 0 1> (<T N 1 2> (<L N/N CD CD 55 N_92/N_92>) (<L N NNS NNS years N>) ) ) (<L (S[adj]\NP)\NP JJ JJ old (S[adj]\NP_82)\NP_83>) ) (<T S[adj]\NP[conj] 1 2> (<L conj CC CC and conj>) (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N JJ JJ former N_102/N_102>) (<L N NN NN chairman N>) ) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_111\NP_111)/NP_112>) (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Consolidated N_135/N_135>) (<T N 1 2> (<L N/N NNP NNP Gold N_128/N_128>) (<T N 1 2> (<L N/N NNP NNP Fields N_121/N_121>) (<L N NNP NNP PLC N>) ) ) ) ) ) ) ) ) ) ) (<L , , , , ,>) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[pss]\NP) VBD VBD was (S[dcl]\NP_10)/(S[pss]_11\NP_10:B)_11>) (<T S[pss]\NP 0 2> (<L (S[pss]\NP)/NP VBN VBN named (S[pss]\NP_18)/NP_19>) (<T NP 0 2> (<T NP 1 2> (<L NP[nb]/N DT DT a NP[nb]_33/N_33>) (<T N 1 2> (<L N/N JJ JJ nonexecutive N_28/N_28>) (<L N NN NN director N>) ) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_41\NP_41)/NP_42>) (<T NP 1 2> (<L NP[nb]/N DT DT this NP[nb]_63/N_63>) (<T N 1 2> (<L N/N JJ JJ British N_58/N_58>) (<T N 1 2> (<L N/N JJ JJ industrial N_51/N_51>) (<L N NN NN conglomerate N>) ) ) ) ) ) ) ) ) (<L . . . . 
.>) )'''
    pt = parse_ccg_derivation(txt)
    if future_string == unicode:
        pt_old = parse_ccg_derivation_old(txt)
    else:
        pt_old = pt_to_utf8(parse_ccg_derivation_old(txt), True)
    actual = repr(pt)
    expected = repr(pt_old)
    # assertEqual, not the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(expected, actual)

    ccg = Ccg2Drs()
    ccg.build_execution_sequence(pt)

    # Check execution queue
    actual = [repr(x) for x in ccg.exeque]
    expected = [
        '<PushOp>:(Rudolph, N/N, NNP)',
        '<PushOp>:(Agnew, N, NNP)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(1, LP NP)',
        '<PushOp>:(,, ,, ,)',
        '<ExecOp>:(2, LP NP)',
        '<PushOp>:(55, N/N, CD)',
        '<PushOp>:(years, N, NNS)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(1, LP NP)',
        '<PushOp>:(old, (S[adj]\\NP)\\NP, JJ)',
        '<ExecOp>:(2, BA S[adj]\\NP)',
        '<PushOp>:(and, conj, CC)',
        '<PushOp>:(former, N/N, JJ)',
        '<PushOp>:(chairman, N, NN)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(1, LP NP)',
        '<PushOp>:(of, (NP\\NP)/NP, IN)',
        '<PushOp>:(Consolidated, N/N, NNP)',
        '<PushOp>:(Gold, N/N, NNP)',
        '<PushOp>:(Fields, N/N, NNP)',
        '<PushOp>:(PLC, N, NNP)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(1, LP NP)',
        '<ExecOp>:(2, FA NP\\NP)',
        '<ExecOp>:(2, BA NP)',
        '<ExecOp>:(2, CONJ_TC S[adj]\\NP[conj])',
        '<ExecOp>:(2, RCONJ S[adj]\\NP)',
        '<ExecOp>:(1, L_UNARY_TC NP\\NP)',
        '<ExecOp>:(2, BA NP)',
        '<PushOp>:(,, ,, ,)',
        '<ExecOp>:(2, LP NP)',
        '<PushOp>:(be, (S[dcl]\\NP)/(S[pss]\\NP), VBD)',
        '<PushOp>:(name, (S[pss]\\NP)/NP, VBN)',
        '<PushOp>:(a, NP[nb]/N, DT)',
        '<PushOp>:(nonexecutive, N/N, JJ)',
        '<PushOp>:(director, N, NN)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(2, FA NP)',
        '<PushOp>:(of, (NP\\NP)/NP, IN)',
        '<PushOp>:(this, NP[nb]/N, DT)',
        '<PushOp>:(british, N/N, JJ)',
        '<PushOp>:(industrial, N/N, JJ)',
        '<PushOp>:(conglomerate, N, NN)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(2, FA NP)',
        '<ExecOp>:(2, FA NP\\NP)',
        '<ExecOp>:(2, BA NP)',
        '<ExecOp>:(2, FA S[pss]\\NP)',
        '<ExecOp>:(2, FA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<PushOp>:(., ., .)',
        '<ExecOp>:(2, LP S[dcl])',
    ]
    self.assertListEqual(expected, actual)
def test1_Wsj0001_2(self):
    """Check execution queue, regenerated ccgbank text, lexicon and heads.

    Builds the execution sequence for one WSJ derivation and verifies, in
    order: the repr of each queued operation, the text returned by
    get_predarg_ccgbank(pretty=True), the words in the lexical queue, and
    the head index recorded on each lexeme.
    """
    txt = r''' (<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Mr. N_107/N_107>) (<L N NNP NNP Vinken N>) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/NP VBZ VBZ is (S[dcl]\NP_112)/NP_113>) (<T NP 0 2> (<T NP 0 1> (<L N NN NN chairman N>) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_109\NP_109)/NP_110>) (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Elsevier N_107/N_107>) (<L N NNP NNP N.V. N>) ) ) (<T NP[conj] 1 2> (<L , , , , ,>) (<T NP 1 2> (<L NP[nb]/N DT DT the NP[nb]_48/N_48>) (<T N 1 2> (<L N/N NNP NNP Dutch N_107/N_107>) (<T N 1 2> (<L N/N VBG VBG publishing N_107/N_107>) (<L N NN NN group N>) ) ) ) ) ) ) ) ) ) (<L . . . . .>) )'''
    pt = parse_ccg_derivation(txt)
    ccg = Ccg2Drs()
    ccg.build_execution_sequence(pt)

    # Check execution queue
    actual = [repr(x) for x in ccg.exeque]
    expected = [
        '<PushOp>:(Mr, N/N, NNP)',
        '<PushOp>:(Vinken, N, NNP)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(1, LP NP)',
        '<PushOp>:(be, (S[dcl]\\NP)/NP, VBZ)',
        '<PushOp>:(chairman, N, NN)',
        '<ExecOp>:(1, LP NP)',
        '<PushOp>:(of, (NP\\NP)/NP, IN)',
        '<PushOp>:(Elsevier, N/N, NNP)',
        '<PushOp>:(N.V, N, NNP)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(1, LP NP)',
        '<PushOp>:(,, ,, ,)',
        '<PushOp>:(the, NP[nb]/N, DT)',
        '<PushOp>:(Dutch, N/N, NNP)',
        '<PushOp>:(publish, N/N, VBG)',
        '<PushOp>:(group, N, NN)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(2, FA NP)',
        '<ExecOp>:(2, RP NP[conj])',
        '<ExecOp>:(2, RCONJ NP)',
        '<ExecOp>:(2, FA NP\\NP)',
        '<ExecOp>:(2, BA NP)',
        '<ExecOp>:(2, FA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<PushOp>:(., ., .)',
        '<ExecOp>:(2, LP S[dcl])',
    ]
    self.assertListEqual(expected, actual)

    # Check ccgbank generation
    # NOTE(review): this is an exact string comparison, so the whitespace
    # layout of txt must match the pretty-printed output - confirm.
    txt2 = '\n' + ccg.get_predarg_ccgbank(pretty=True)
    self.assertEqual(txt, txt2)

    # Check lexicon
    expected = [
        'Mr.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N.V.', ',', 'the', 'Dutch', 'publishing', 'group', '.'
    ]
    actual = [x.word for x in ccg.lexque]
    self.assertListEqual(expected, actual)

    # Check dependencies
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(ccg.lexque[0].head, 1)    # Mr. -> Vinken
    self.assertEqual(ccg.lexque[1].head, 2)    # Vinken -> is
    self.assertEqual(ccg.lexque[2].head, 2)    # root
    self.assertEqual(ccg.lexque[3].head, 2)    # chairman -> is
    self.assertEqual(ccg.lexque[4].head, 3)    # of -> chairman
    self.assertEqual(ccg.lexque[5].head, 6)    # Elsevier -> N.V.
    self.assertEqual(ccg.lexque[6].head, 4)    # N.V. -> of
    self.assertEqual(ccg.lexque[8].head, 11)   # the -> group
    self.assertEqual(ccg.lexque[9].head, 11)   # Dutch -> group
    self.assertEqual(ccg.lexque[10].head, 11)  # publishing -> group
    self.assertEqual(ccg.lexque[11].head, 6)   # group -> N.V.
def test2_Wsj0037_37(self):
    """Check execution queue, lexicon and heads for a derivation with conjoined modifiers.

    Builds the execution sequence for one WSJ derivation and verifies the
    repr of each queued operation, the words in the lexical queue, and the
    head index recorded on each lexeme.
    """
    txt = r''' (<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<T N/N 0 2> (<L N/N JJR JJR More N_134/N_134>) (<T N/N[conj] 1 2> (<L conj CC CC and conj>) (<L N/N JJR JJR more N_141/N_141>) ) ) (<L N NNS NNS corners N>) ) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_152\NP_152)/NP_153>) (<T NP 1 2> (<L NP[nb]/N DT DT the NP[nb]_160/N_160>) (<L N NN NN globe N>) ) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[ng]\NP) VBP VBP are (S[dcl]\NP_91)/(S[ng]_92\NP_91:B)_92>) (<T S[ng]\NP 0 2> (<L (S[ng]\NP)/(S[adj]\NP) VBG VBG becoming (S[ng]\NP_101)/(S[adj]_102\NP_101:B)_102>) (<T S[adj]\NP 0 2> (<L (S[adj]\NP)/PP JJ JJ free (S[adj]\NP_109)/PP_110>) (<T PP 0 2> (<L PP/NP IN IN of PP/NP_115>) (<T NP 0 1> (<T N 1 2> (<L N/N NN NN tobacco N_124/N_124>) (<L N NN NN smoke N>) ) ) ) ) ) ) ) (<L . . . . .>) )'''
    pt = parse_ccg_derivation(txt)
    ccg = Ccg2Drs()
    ccg.build_execution_sequence(pt)

    # Check execution queue
    actual = [repr(x) for x in ccg.exeque]
    expected = [
        '<PushOp>:(more, N/N, JJR)',
        '<PushOp>:(and, conj, CC)',
        '<PushOp>:(more, N/N, JJR)',
        '<ExecOp>:(2, RP N/N[conj])',
        '<ExecOp>:(2, RCONJ N/N)',
        '<PushOp>:(corners, N, NNS)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(1, LP NP)',
        '<PushOp>:(of, (NP\\NP)/NP, IN)',
        '<PushOp>:(the, NP[nb]/N, DT)',
        '<PushOp>:(globe, N, NN)',
        '<ExecOp>:(2, FA NP)',
        '<ExecOp>:(2, FA NP\\NP)',
        '<ExecOp>:(2, BA NP)',
        '<PushOp>:(be, (S[dcl]\\NP)/(S[ng]\\NP), VBP)',
        '<PushOp>:(become, (S[ng]\\NP)/(S[adj]\\NP), VBG)',
        '<PushOp>:(free, (S[adj]\\NP)/PP, JJ)',
        '<PushOp>:(of, PP/NP, IN)',
        '<PushOp>:(tobacco, N/N, NN)',
        '<PushOp>:(smoke, N, NN)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(1, LP NP)',
        '<ExecOp>:(2, FA PP)',
        '<ExecOp>:(2, FA S[adj]\\NP)',
        '<ExecOp>:(2, FA S[ng]\\NP)',
        '<ExecOp>:(2, FA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<PushOp>:(., ., .)',
        '<ExecOp>:(2, LP S[dcl])',
    ]
    self.assertListEqual(expected, actual)

    # Check lexicon
    expected = [
        'More', 'and', 'more', 'corners', 'of', 'the', 'globe', 'are', 'becoming',
        'free', 'of', 'tobacco', 'smoke', '.'
    ]
    actual = [x.word for x in ccg.lexque]
    self.assertListEqual(expected, actual)

    # Check dependencies
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(ccg.lexque[0].head, 3)     # More -> corners
    self.assertEqual(ccg.lexque[2].head, 0)     # more -> More
    self.assertEqual(ccg.lexque[3].head, 7)     # corners -> are
    self.assertEqual(ccg.lexque[4].head, 3)     # of -> corners
    self.assertEqual(ccg.lexque[5].head, 6)     # the -> globe
    self.assertEqual(ccg.lexque[6].head, 4)     # globe -> of
    self.assertEqual(ccg.lexque[7].head, 7)     # root
    self.assertEqual(ccg.lexque[8].head, 7)     # becoming -> are
    self.assertEqual(ccg.lexque[9].head, 8)     # free -> becoming
    self.assertEqual(ccg.lexque[10].head, 9)    # of -> free
    self.assertEqual(ccg.lexque[11].head, 12)   # tobacco -> smoke
    self.assertEqual(ccg.lexque[12].head, 10)   # smoke -> of
def test9_RuleExecutionEasySRL(self):
    """Re-apply the rule of each ExecOp in sampled EasySRL derivations.

    Every file under data/ldc/easysrl/ccgbank whose name contains
    'ccg_derivation' is read and every 50th line is parsed. For each ExecOp
    whose rule is not one of the skipped unary/type-change/pass-through
    rules, the rule is applied to the operand categories and the outcome
    must unify with the category recorded on the op. Lines that fail to
    parse or build are counted and skipped.
    """
    allfiles = []
    projdir = os.path.dirname(
        os.path.dirname(
            os.path.dirname(
                os.path.dirname(
                    os.path.dirname(
                        os.path.dirname(os.path.dirname(__file__)))))))
    ldcpath = os.path.join(projdir, 'data', 'ldc', 'easysrl', 'ccgbank')
    dirlist1 = os.listdir(ldcpath)
    for fname in dirlist1:
        if 'ccg_derivation' not in fname:
            continue
        ldcpath1 = os.path.join(ldcpath, fname)
        if os.path.isfile(ldcpath1):
            allfiles.append(ldcpath1)

    # Rules that are not re-applied by this test.
    skipped_rules = [RL_TCL_UNARY, RL_TCR_UNARY, RL_TC_ATOM, RL_TC_CONJ,
                     RL_LPASS, RL_RPASS, RL_TYPE_RAISE]

    failed_parse = 0
    failed_exec = []
    # Offset of the first sampled line; reset to 0 once the loop is entered.
    start = 0
    analysis = []
    for fn in allfiles:
        with open(fn, 'r') as fd:
            lines = fd.readlines()

        name, _ = os.path.splitext(os.path.basename(fn))
        for i in range(start, len(lines), 50):
            start = 0
            ccgbank = lines[i]
            dprint('%s-%04d' % (name, i))
            ccg = Ccg2Drs()
            try:
                pt = parse_ccg_derivation(ccgbank)
                ccg.build_execution_sequence(pt)
            except Exception:
                failed_parse += 1
                continue

            for op in ccg.exeque:
                if isinstance(op, PushOp):
                    continue
                self.assertIsInstance(op, ExecOp)
                left = op.sub_ops[0].category.remove_wildcards()
                result = op.category.remove_wildcards()
                if len(op.sub_ops) == 2:
                    right = op.sub_ops[1].category.remove_wildcards()
                else:
                    right = CAT_EMPTY

                if op.rule is not None and op.rule not in skipped_rules:
                    actual = op.rule.apply_rule_to_category(left, right)
                    if not actual.can_unify(result):
                        # Three entries are recorded per failure.
                        failed_exec.append('%s-%04d: %s <!> %s' % (name, i, actual, result))
                        failed_exec.append('%s <- %s %s %s' % (actual, left, op.rule, right))
                        failed_exec.append(ccgbank)
                    else:
                        # Can add analysis here
                        if left == CAT_NP and right == CAT_NP_NP and op.rule == RL_BA and op.head != 0:
                            # Expected: NP(x) <- λx.NP(x) -BA- λxλy.NP(y)\NP(x)
                            # Actual:   NP(y) <- λx.NP(x) -BA- λxλy.NP(y)\NP(x)
                            analysis.append('%s-%04d: NP <ba> NP\\NP' % (name, i))
                    self.assertTrue(actual.can_unify(result))

    if len(analysis) != 0:
        dprint('-----------------------')
        # The original format string had no argument, so '%d' printed literally.
        print('%d rules failed analysis' % len(analysis))
        dprint('--')
        for ln in analysis:
            dprint(ln)

    if len(failed_exec) != 0:
        dprint('-----------------------')
        # Parenthesised: '%' and '/' share precedence, so the unparenthesised
        # form divided the formatted string by 3 and raised TypeError.
        print('%d rules failed exec' % (len(failed_exec) // 3))
        dprint('--')
        for ln in failed_exec:
            dprint(ln)

    self.assertTrue(len(failed_exec) == 0)
    self.assertTrue(len(analysis) == 0)
def test7_Wsj0051_30(self):
    """Check rule selection for conj + S[em] and the resulting execution queue.

    First asserts that get_rule picks RL_RPASS when 'conj' and 'S[em]'
    combine into 'S[dcl][conj]', then compares the repr of every queued
    operation for the derivation against the expected list.
    """
    txt = r''' (<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 1> (<T N 1 2> (<L N NNP NNP Fujitsu N>) (<T N[conj] 1 2> (<L conj CC CC and conj>) (<L N NNP NNP NEC N>) ) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/S[dcl] VBD VBD said (S[dcl]\NP_146)/S[dcl]_147>) (<T S[dcl] 0 2> (<T S[dcl] 1 2> (<L NP PRP PRP they NP>) (<T S[dcl]\NP 0 2> (<T (S[dcl]\NP)/(S[ng]\NP) 0 2> (<L (S[dcl]\NP)/(S[ng]\NP) VBD VBD were (S[dcl]\NP_156)/(S[ng]_157\NP_156:B)_157>) (<L (S\NP)\(S\NP) RB RB still (S_169\NP_164)_169\(S_169\NP_164)_169>) ) (<L S[ng]\NP VBG VBG investigating S[ng]\NP_174>) ) ) (<T S[dcl][conj] 1 2> (<L , , , , ,>) (<T S[dcl][conj] 1 2> (<L conj CC CC and conj>) (<T S[em] 0 2> (<L S[em]/S[dcl] IN IN that S[em]/S[dcl]_181>) (<T S[dcl] 1 2> (<T NP 0 2> (<T NP 0 1> (<L N NN NN knowledge N>) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_207\NP_207)/NP_208>) (<T NP 0 1> (<T N 1 2> (<L N/N JJR JJR more N_224/N_224>) (<T N 1 2> (<L N/N JJ JJ such N_217/N_217>) (<L N NNS NNS bids N>) ) ) ) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[b]\NP) MD MD could (S[dcl]\NP_190)/(S[b]_191\NP_190:B)_191>) (<L S[b]\NP VB VB emerge S[b]\NP_196>) ) ) ) ) ) ) ) ) (<L . . . . 
.>) ) '''
    tree = parse_ccg_derivation(txt)
    builder = Ccg2Drs()

    # Rule lookup for: conj + S[em] -> S[dcl][conj]
    left_cat = Category.from_cache('conj')
    right_cat = Category.from_cache('S[em]')
    result_cat = Category.from_cache('S[dcl][conj]')
    selected_rule = get_rule(left_cat, right_cat, result_cat)
    self.assertEqual(selected_rule, RL_RPASS)

    builder.build_execution_sequence(tree)

    # Check execution queue
    observed_ops = []
    for queued in builder.exeque:
        observed_ops.append(repr(queued))
    expected_ops = [
        '<PushOp>:(Fujitsu, N, NNP)',
        '<PushOp>:(and, conj, CC)',
        '<PushOp>:(NEC, N, NNP)',
        '<ExecOp>:(2, RP N[conj])',
        '<ExecOp>:(2, RCONJ N)',
        '<ExecOp>:(1, LP NP)',
        '<PushOp>:(say, (S[dcl]\\NP)/S[dcl], VBD)',
        '<PushOp>:(they, NP, PRP)',
        '<PushOp>:(be, (S[dcl]\\NP)/(S[ng]\\NP), VBD)',
        '<PushOp>:(still, (S\\NP)\\(S\\NP), RB)',
        '<ExecOp>:(2, BX (S[dcl]\\NP)/(S[ng]\\NP))',
        '<PushOp>:(investigate, S[ng]\\NP, VBG)',
        '<ExecOp>:(2, FA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<PushOp>:(,, ,, ,)',
        '<PushOp>:(and, conj, CC)',
        '<PushOp>:(that, S[em]/S[dcl], IN)',
        '<PushOp>:(knowledge, N, NN)',
        '<ExecOp>:(1, LP NP)',
        '<PushOp>:(of, (NP\\NP)/NP, IN)',
        '<PushOp>:(more, N/N, JJR)',
        '<PushOp>:(such, N/N, JJ)',
        '<PushOp>:(bids, N, NNS)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(1, LP NP)',
        '<ExecOp>:(2, FA NP\\NP)',
        '<ExecOp>:(2, BA NP)',
        '<PushOp>:(could, (S\\NP)/(S\\NP), MD)',
        '<PushOp>:(emerge, S[b]\\NP, VB)',
        '<ExecOp>:(2, FA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<ExecOp>:(2, FA S[em])',
        '<ExecOp>:(2, RP S[dcl][conj])',
        '<ExecOp>:(2, RP S[dcl][conj])',
        '<ExecOp>:(2, RCONJ S[dcl])',
        '<ExecOp>:(2, FA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<PushOp>:(., ., .)',
        '<ExecOp>:(2, LP S[dcl])'
    ]
    self.assertListEqual(expected_ops, observed_ops)
def build_from_ldc_ccgbank(fn_dict, outdir, verbose=False, verify=True):
    """Build functor templates from the LDC ccgbank AUTO files.

    Each file under data/ldc/ccgbank_1_1/data/AUTO/<dir>/ is read as
    alternating header/derivation lines. Every derivation is parsed and its
    predicate-argument categories are collected; a template is then created
    for each collected category and stored in fn_dict under the signature of
    the cleaned category.

    Args:
        fn_dict: Dictionary keyed by category signature. Updated in place.
        outdir: Directory that receives 'parse_ccg_derivation_failed.dat'
            and 'functor_ldc_templates_failed.dat' when failures occur.
        verbose: When true, also print the recorded failure messages.
        verify: When true and a signature already exists in fn_dict, assert
            that the string forms of the existing and new templates match.
            A mismatch is recorded as a failed rule; it does not propagate.

    Returns:
        The fn_dict that was passed in.
    """
    print('Building function templates from LDC ccgbank...')
    allfiles = []
    ldcpath = os.path.join(projdir, 'data', 'ldc', 'ccgbank_1_1', 'data', 'AUTO')
    dirlist1 = os.listdir(ldcpath)
    for dir1 in dirlist1:
        ldcpath1 = os.path.join(ldcpath, dir1)
        if os.path.isdir(ldcpath1):
            dirlist2 = os.listdir(ldcpath1)
            for dir2 in dirlist2:
                ldcpath2 = os.path.join(ldcpath1, dir2)
                if os.path.isfile(ldcpath2):
                    allfiles.append(ldcpath2)

    # failed_parse holds two entries per failure: the derivation, then the error.
    failed_parse = []
    failed_rules = []
    rules = []
    progress = 0
    for fn in allfiles:
        progress = print_progress(progress, 10)
        with open(fn, 'r') as fd:
            lines = fd.readlines()
        # Even lines are headers, odd lines are the derivations.
        for ccgbank in lines[1::2]:
            pt = None
            try:
                pt = parse_ccg_derivation(ccgbank)
                extract_predarg_categories_from_pt(pt, rules)
            except Exception as e:
                failed_parse.append(safe_utf8_encode('CCGBANK: ' + ccgbank.strip()))
                failed_parse.append(safe_utf8_encode('Error: %s' % e))
            # Now attempt to track undefined unary rules
            if pt is not None:
                try:
                    builder = Ccg2Drs()
                    builder.build_execution_sequence(pt)
                    # Calling this will track undefined
                    builder.get_predarg_ccgbank()
                except Exception:
                    # Best effort only: a failure here must not stop the build.
                    pass

    progress = (progress / 10) * 1000
    for predarg in rules:
        progress = print_progress(progress, 1000)
        try:
            catkey = predarg.clean(True)
            template = FunctorTemplate.create_from_category(predarg)
            if template is None:
                continue
            if catkey.signature not in fn_dict:
                fn_dict[catkey.signature] = template
            elif verify:
                f1 = fn_dict[catkey.signature]
                t1 = future_string(f1)
                t2 = future_string(template)
                assert t1 == t2, 'verify failed\n t1=%s\n t2=%s\n f1=%s\n f2=%s' % (t1, t2, f1.predarg_category, predarg)
        except Exception as e:
            failed_rules.append(safe_utf8_encode('%s: %s' % (predarg, e)))

    print_progress(progress, done=True)

    if len(failed_parse) != 0:
        print('Warning: ldc - %d parses failed' % (len(failed_parse) // 2))
        # Binary mode: the payload is assembled with a bytes separator.
        with open(os.path.join(outdir, 'parse_ccg_derivation_failed.dat'), 'wb') as fd:
            fd.write(b'\n'.join(failed_parse))
        if verbose:
            # Odd entries are the error messages. The list is flat, so the
            # original pairwise unpacking of each entry could not work.
            for m in failed_parse[1::2]:
                print(m)

    if len(failed_rules) != 0:
        print('Warning: ldc - %d rules failed' % len(failed_rules))
        with open(os.path.join(outdir, 'functor_ldc_templates_failed.dat'), 'wb') as fd:
            fd.write(b'\n'.join(failed_rules))
        if verbose:
            for m in failed_rules:
                print(m)

    return fn_dict
def test7_RuleUniquenessLDC(self):
    """Check that get_rule is unambiguous for ops in sampled LDC derivations.

    Reads every file under data/ldc/ccgbank_1_1/data/AUTO/<dir>/, samples
    header/derivation line pairs with a stride of 10, and for each ExecOp
    calls get_rule repeatedly with a shared exclude list until it returns
    None or the call limit is reached. An op is reported as ambiguous when
    the exclude list ends up with more than one entry.
    """
    # Walk seven directory levels up from this file.
    root = __file__
    for _ in range(7):
        root = os.path.dirname(root)
    auto_dir = os.path.join(root, 'data', 'ldc', 'ccgbank_1_1', 'data', 'AUTO')

    derivation_files = []
    for section in os.listdir(auto_dir):
        section_path = os.path.join(auto_dir, section)
        if not os.path.isdir(section_path):
            continue
        for entry in os.listdir(section_path):
            entry_path = os.path.join(section_path, entry)
            if os.path.isfile(entry_path):
                derivation_files.append(entry_path)

    unparsed = 0
    ambiguous = []
    for path in derivation_files:
        with open(path, 'r') as fd:
            lines = fd.readlines()

        for header, derivation in zip(lines[0::10], lines[1::10]):
            dprint(header.strip())
            ccg = Ccg2Drs()
            try:
                pt = parse_ccg_derivation(derivation)
                ccg.build_execution_sequence(pt)
            except Exception:
                unparsed += 1
                continue

            self.assertIsNotNone(pt)

            for op in ccg.exeque:
                if isinstance(op, PushOp):
                    continue
                self.assertIsInstance(op, ExecOp)
                left = op.sub_ops[0].category
                result = op.category
                right = op.sub_ops[1].category if len(op.sub_ops) == 2 else CAT_EMPTY

                exclude = []
                # Should not have ambiguity
                rule = get_rule(left, right, result, exclude)
                limit = 5
                seen = []
                while rule is not None:
                    seen.append(repr(rule))
                    rule = get_rule(left, right, result, exclude)
                    limit -= 1
                    if limit == 0:
                        # One further lookup is made before giving up.
                        rule = get_rule(left, right, result, exclude)
                        break
                rstr = ''.join(text + '|' for text in seen)

                if len(exclude) > 1:
                    description = '%s <- %s <{%s}> %s' % (result, left, rstr, right)
                    ambiguous.append((description, exclude))
                self.assertGreater(limit, 0)

    for item in ambiguous:
        dprint('ambiguous rule: %s {%s}' % item)
    self.assertTrue(len(ambiguous) == 0)