def test1_PP_Attachment(self):
    # NCCG get the PP attachment wrong
    text = "Eat spaghetti with meatballs"
    deriv = grpc.ccg_parse(self.stub, text, grpc.DEFAULT_SESSION)
    parse_tree = parse_ccg_derivation(deriv)
    self.assertIsNotNone(parse_tree)
    dprint(sentence_from_pt(parse_tree))
    # Build the sentence without verbnet/wikipedia lookups so the test is hermetic.
    sent = process_ccg_pt(parse_tree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    drs = sent.get_drs()
    dprint(drs.show(SHOW_LINEAR))
    constituents = get_constituents_string_list(sent)
    dprint('\n'.join(constituents))
    # '#' marks the head word of each constituent.
    expected = [
        'S_INF(#Eat spaghetti with meatballs)',  # 0
        'NP(#spaghetti)',                        # 1
        'NP(#meatballs)',                        # 2
    ]
    self.assertListEqual(expected, constituents)
    expected_tree = (0, [(1, []), (2, [])])
    tree = sent.get_constituent_tree()
    dprint_constituent_tree(sent, tree)
    self.assertEqual(repr(expected_tree), repr(tree))
    vsent = get_constituent_string(sent.get_verbnet_sentence())
    self.assertEqual('S_INF(#Eat with) NP(#spaghetti) NP(#meatballs)', vsent)
def test1_EasySRL_BoyGirl2(self):
    # EasySRL derivation for "The boy will want to believe the girl".
    txt = r'''(<T S[dcl] 1 2> (<T NP 0 2> (<L NP/N DT DT The NP/N>) (<L N NN NN boy N>) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[b]\NP) MD MD will (S[dcl]\NP)/(S[b]\NP)>) (<T S[b]\NP 0 2> (<L (S[b]\NP)/(S[to]\NP) VB VB want (S[b]\NP)/(S[to]\NP)>) (<T S[to]\NP 0 2> (<L (S[to]\NP)/(S[b]\NP) TO TO to (S[to]\NP)/(S[b]\NP)>) (<T S[b]\NP 0 2> (<L (S[b]\NP)/NP VB VB believe (S[b]\NP)/NP>) (<T NP 0 2> (<L NP/N DT DT the NP/N>) (<L N NN NN girl N>) ) ) ) ) ) )'''
    parse_tree = parse_ccg_derivation(txt)
    self.assertIsNotNone(parse_tree)
    dprint(sentence_from_pt(parse_tree))
    # Drive the Ccg2Drs pipeline step by step rather than via process_ccg_pt().
    builder = Ccg2Drs(CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    builder.build_execution_sequence(parse_tree)
    builder.create_drs()
    builder.resolve_proper_names()
    builder.final_rename()
    linear = builder.get_drs().show(SHOW_LINEAR)
    dprint(linear)
    expected_drs = '[X1,E2,E3,X4| boy(X1),will(E2),_MODAL(E2),want(E2),_EVENT(E2),_ARG0(E2,X1),_ARG1(E2,E3),believe(E3),_EVENT(E3),_ARG0(E3,X1),_ARG1(E3,X4),girl(X4)]'
    self.assertEqual(expected_drs, linear)
    constituents = get_constituents_string_list(builder)
    dprint('\n'.join(constituents))
    # '#' marks the head word of each constituent.
    expected = [
        'S(The boy #will want to believe the girl)',
        'NP(#The boy)',
        'S_INF(#want to believe the girl)',
        'S_INF(#to believe the girl)',
        'S_INF(#believe the girl)',
        'NP(#the girl)'
    ]
    self.assertListEqual(expected, constituents)
    vsent = get_constituent_string(builder.get_verbnet_sentence())
    self.assertEqual('NP(#The boy) VP(#will want) S_INF(#to believe) NP(#the girl)', vsent)
def test2_Wsj_0056_1(self):
    # RAW 1043
    # Degenerate one-token input: a bare '@' must still produce a constituent.
    text = '''@'''
    deriv = grpc.ccg_parse(self.stub, text, grpc.DEFAULT_SESSION)
    parse_tree = parse_ccg_derivation(deriv)
    self.assertIsNotNone(parse_tree)
    dprint(sentence_from_pt(parse_tree))
    sent = process_ccg_pt(parse_tree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    dprint(sent.get_drs().show(SHOW_LINEAR))
    constituents = get_constituents_string_list(sent)
    dprint('\n'.join(constituents))
    self.assertListEqual(['S(#@)'], constituents)
def make_drs(daemon):
    """Batch-convert CCG derivations for `daemon` into DRS data files.

    Reads every data/ldc/<daemon>/ccgbank/ccg_derivation*.txt file, converts
    each derivation line to a DRS, and writes one drs_<id>_<nnnn>.dat file
    (sentence, linear DRS, predarg sections) under data/ldc/<daemon>/drs/<id>/.
    Failures are counted and reported at the end instead of aborting the run.
    """
    global pypath, projdir, datapath, idsrch
    allfiles = []
    projdir = os.path.dirname(os.path.dirname(__file__))
    easysrl_path = os.path.join(projdir, 'data', 'ldc', daemon, 'drs')
    if not os.path.exists(easysrl_path):
        os.makedirs(easysrl_path)

    # Collect the derivation input files.
    ldcpath = os.path.join(projdir, 'data', 'ldc', daemon, 'ccgbank')
    dirlist1 = os.listdir(ldcpath)
    for fname in dirlist1:
        if 'ccg_derivation' not in fname:
            continue
        ldcpath1 = os.path.join(ldcpath, fname)
        if os.path.isfile(ldcpath1):
            allfiles.append(ldcpath1)

    failed_parse = 0
    failed_ccg2drs = []
    start = 0
    progress = -1
    for fn in allfiles:
        idx = idsrch.match(fn)
        if idx is None:
            continue
        idx = idx.group('id')
        if not os.path.exists(os.path.join(easysrl_path, idx)):
            os.mkdir(os.path.join(easysrl_path, idx))
        with open(fn, 'r') as fd:
            lines = fd.readlines()
        name, _ = os.path.splitext(os.path.basename(fn))
        for i in range(start, len(lines)):
            start = 0
            ccgbank = lines[i].strip()
            if len(ccgbank) == 0 or ccgbank[0] == '#':
                continue
            if progress < 0:
                print('%s-%04d' % (name, i))
            else:
                progress = print_progress(progress, 10)
            try:
                # CCG parser is Java so output is UTF-8.
                pt = parse_ccg_derivation(ccgbank)
                s = sentence_from_pt(pt).strip()
                pccg = pt_to_ccg_derivation(pt)
            except Exception:
                # BUG FIX: a leftover debug `raise` sat before this `continue`,
                # making it unreachable and aborting the whole batch on the
                # first bad derivation. Count the failure and keep going.
                failed_parse += 1
                continue
            try:
                d = process_ccg_pt(
                    pt, CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH).get_drs()
                assert d is not None
                assert isinstance(d, DRS)
                d = d.show(SHOW_LINEAR).strip()
            except Exception as e:
                # BUG FIX: same dead `raise`-before-`continue` pattern removed
                # so the failure list below is actually populated and reported.
                print(e)
                failed_ccg2drs.append((name, i, ccgbank))
                continue
            # Note: file opened in text mode but written with byte strings —
            # this module is Python 2 (b'...' is str there).
            with open(
                    os.path.join(easysrl_path, idx, 'drs_%s_%04d.dat' % (idx, i)),
                    'w') as fd:
                fd.write(b'<sentence>\n')
                fd.write(safe_utf8_encode(s))
                fd.write(b'\n</sentence>\n<drs>\n')
                fd.write(safe_utf8_encode(d))
                fd.write(b'\n</drs>\n<predarg>\n')
                fd.write(safe_utf8_encode(pccg))
                fd.write(b'\n')
                fd.write(b'</predarg>\n')

    if failed_parse != 0:
        print('%d derivations failed to parse' % failed_parse)
    if len(failed_ccg2drs) != 0:
        print('%d derivations failed to convert to DRS' % len(failed_ccg2drs))
        for x in failed_ccg2drs:
            print('%s-%04d failed: {%s}' % x)
def test2_GOLD_Wsj0051_13(self):
    # Gold-standard CCGbank derivation for wsj_0051.13:
    #   "The bids, he added, were contrary to common sense."
    txt = r''' (<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 1 2> (<L NP[nb]/N DT DT The NP[nb]_273/N_273>) (<L N NNS NNS bids N>) ) (<T S[dcl]\NP 1 2> (<T (S\NP)/(S\NP) 1 2> (<L , , , , ,>) (<T (S\NP)/(S\NP) 0 2> (<T S[dcl]/S[dcl] 1 2> (<T S/(S\NP) 0 1> (<L NP PRP PRP he NP>) ) (<L (S[dcl]\NP)/S[dcl] VBD VBD added (S[dcl]\NP_242)/S[dcl]_243>) ) (<L , , , , ,>) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[adj]\NP) VBD VBD were (S[dcl]\NP_211)/(S[adj]_212\NP_211:B)_212>) (<T S[adj]\NP 0 2> (<L (S[adj]\NP)/PP JJ JJ contrary (S[adj]\NP_219)/PP_220>) (<T PP 0 2> (<L PP/NP TO TO to PP/NP_225>) (<T NP 0 1> (<T N 1 2> (<L N/N JJ JJ common N_234/N_234>) (<L N NN NN sense N>) ) ) ) ) ) ) ) (<L . . . . .>) ) '''
    pt = parse_ccg_derivation(txt)
    s = sentence_from_pt(pt)
    dprint(s)
    self.assertIsNotNone(pt)
    # Build the DRS without verbnet/wikipedia lookups so the test is hermetic.
    ccg = Ccg2Drs(CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    ccg.build_execution_sequence(pt)
    ccg.create_drs()
    ccg.final_rename()
    d = ccg.get_drs()
    s = d.show(SHOW_LINEAR)
    dprint(s)
    sent = ccg.get_verbnet_sentence()
    a = get_constituents_string_list(sent)
    # '#' marks the head word of each constituent.
    x = [
        'NP(The #bids)',
        'ADVP(he #added)',
        'VP(#were)',
        'ADJP(#contrary to common sense)',
        'PP(#to)',
        'NP(common #sense)'
    ]
    dprint('\n'.join(a))
    self.assertListEqual(x, a)
def test2_GOLD_Wsj0003_1(self):
    # A form of asbestos once used to make Kent cigarette filters has caused a high percentage of cancer deaths
    # among a group of workers exposed to it more than 30 years ago, researchers reported.
    # ID=wsj_0003.1 PARSER=GOLD NUMPARSE=1
    # (The full gold derivation is the raw literal below.)
    txt = r'''(<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T S[dcl] 1 2> (<T NP 0 2> (<T NP 0 2> (<T NP 1 2> (<L NP[nb]/N DT DT A NP[nb]_166/N_166>) (<L N NN NN form N>) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_174\NP_174)/NP_175>) (<T NP 0 1> (<L N NN NN asbestos N>) ) ) ) (<T NP\NP 0 1> (<T S[pss]\NP 1 2> (<L (S\NP)/(S\NP) RB RB once (S_235\NP_230)_235/(S_235\NP_230)_235>) (<T S[pss]\NP 0 2> (<L (S[pss]\NP)/(S[to]\NP) VBN VBN used (S[pss]\NP_187)/(S[to]_188\NP_187:B)_188>) (<T S[to]\NP 0 2> (<L (S[to]\NP)/(S[b]\NP) TO TO to (S[to]\NP_197)/(S[b]_198\NP_197:B)_198>) (<T S[b]\NP 0 2> (<L (S[b]\NP)/NP VB VB make (S[b]\NP_205)/NP_206>) (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Kent N_222/N_222>) (<T N 1 2> (<L N/N NN NN cigarette N_215/N_215>) (<L N NNS NNS filters N>) ) ) ) ) ) ) ) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[pt]\NP) VBZ VBZ has (S[dcl]\NP_23)/(S[pt]_24\NP_23:B)_24>) (<T S[pt]\NP 0 2> (<L (S[pt]\NP)/NP VBN VBN caused (S[pt]\NP_31)/NP_32>) (<T NP 0 2> (<T NP 0 2> (<T NP 1 2> (<L NP[nb]/N DT DT a NP[nb]_46/N_46>) (<T N 1 2> (<L N/N JJ JJ high N_41/N_41>) (<L N NN NN percentage N>) ) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_54\NP_54)/NP_55>) (<T NP 0 1> (<T N 1 2> (<L N/N NN NN cancer N_64/N_64>) (<L N NNS NNS deaths N>) ) ) ) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN among (NP_73\NP_73)/NP_74>) (<T NP 0 2> (<T NP 1 2> (<L NP[nb]/N DT DT a NP[nb]_81/N_81>) (<L N NN NN group N>) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_89\NP_89)/NP_90>) (<T NP 0 2> (<T NP 0 1> (<L N NNS NNS workers N>) ) (<T NP\NP 0 1> (<T S[pss]\NP 0 2> (<T S[pss]\NP 0 2> (<L (S[pss]\NP)/PP VBN VBN exposed (S[pss]\NP_100)/PP_101>) (<T PP 0 2> (<L PP/NP TO TO to PP/NP_106>) (<L NP PRP PRP it NP>) ) ) (<T (S\NP)\(S\NP) 1 2> (<T NP 0 1> (<T N 1 2> (<T N/N 1 2> (<T (N/N)/(N/N) 1 2> (<L S[adj]\NP RBR RBR more S[adj]\NP_153>) (<L ((N/N)/(N/N))\(S[adj]\NP) IN IN than ((N_147/N_139)_147/(N_147/N_139)_147)\(S[adj]_148\NP_142)_148>) ) (<L N/N CD CD 30 N_131/N_131>) ) (<L N NNS NNS years N>) ) ) (<L ((S\NP)\(S\NP))\NP IN IN ago ((S_121\NP_116)_121\(S_121\NP_116)_121)\NP_122>) ) ) ) ) ) ) ) ) ) ) ) (<T S[dcl]\S[dcl] 1 2> (<L , , , , ,>) (<T S[dcl]\S[dcl] 1 2> (<T NP 0 1> (<L N NNS NNS researchers N>) ) (<L (S[dcl]\S[dcl])\NP VBD VBD reported (S[dcl]\S[dcl]_8)\NP_9>) ) ) ) (<L . . . . .>) )'''
    pt = parse_ccg_derivation(txt)
    s = sentence_from_pt(pt)
    dprint(s)
    self.assertIsNotNone(pt)
    # Build the DRS without verbnet/wikipedia lookups so the test is hermetic.
    ccg = Ccg2Drs(CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    ccg.build_execution_sequence(pt)
    ccg.create_drs()
    ccg.final_rename()
    d = ccg.get_drs()
    s = d.show(SHOW_LINEAR)
    dprint(s)
    sent = ccg.get_verbnet_sentence()
    a = get_constituents_string_list(sent)
    # '#' marks the head word of each constituent.
    x = [
        'S_INF(#to make)' if False else 'NP(A #form)' if False else 'NP(A #form)',  # placeholder removed below
    ]
    x = [
        'NP(A #form)',                                      # 0
        'PP(#of)',                                          # 1
        'NP(#asbestos)',                                    # 2
        'ADVP(once #used to make Kent cigarette filters)',  # 3
        'S_INF(#to make)',                                  # 4
        'NP(Kent cigarette #filters)',                      # 5
        'VP(#has caused)',                                  # 6
        'NP(a high #percentage)',                           # 7
        'PP(#of)',                                          # 8
        'NP(cancer #deaths)',                               # 9
        'PP(#among)',                                       #10
        'NP(a #group)',                                     #11
        'PP(#of)',                                          #12
        'NP(#workers)',                                     #13
        'ADVP(#exposed to it more than 30 years ago)',      #14
        'NP(more than 30 #years)',                          #15
        'NP(#researchers)',                                 #16
        'VP(#reported)',                                    #17
    ]
    dprint('\n'.join(a))
    self.assertListEqual(x, a)
    # Expected constituent tree; tuple entries index into the list above:
    # 17 VP(reported.)
    # 06 VP(has caused)
    # 00 NP(A form)
    # 01 PP(of)
    # 02 NP(asbestos)
    # 03 ADVP(once used to make Kent cigarette filters)
    # 04 S_INF(to make)
    # 05 NP(Kent cigarette filters)
    # 07 NP(a high percentage)
    # 08 PP(of)
    # 09 NP(cancer deaths)
    # 10 PP(among)
    # 11 NP(a group)
    # 12 PP(of)
    # 13 NP(workers)
    # 14 ADVP(exposed to it more than 30 years ago)
    # 15 NP(more than 30 years)
    # 16 NP(reserchers)
    x = (17, [(6, [(0, [(1, [(2, [])]), (3, [(4, [(5, [])])])]), (7, [(8, [(9, [])]), (10, [(11, [(12, [(13, [(14, [(15, [])])])])])])])]), (16, [])])
    a = sent.get_constituent_tree()
    dprint_constituent_tree(sent, a)
    self.assertEqual(repr(x), repr(a))
def test2_GOLD_Wsj0001_2(self):
    # Mr. Vinken is chairman of Elsevier N.V. , the Dutch publishing group .
    #
    # PARG (gold predicate-argument relations for reference):
    # 1  0  N/N             1  Vinken    Mr.
    # 1  2  (S[dcl]\NP)/NP  1  Vinken    is
    # 3  2  (S[dcl]\NP)/NP  2  chairman  is
    # 3  4  (NP\NP)/NP      1  chairman  of
    # 6  4  (NP\NP)/NP      2  N.V.      of
    # 6  5  N/N             1  N.V.      Elsevier
    # 11 4  (NP\NP)/NP      2  group     of
    # 11 8  NP[nb]/N        1  group     the
    # 11 9  N/N             1  group     Dutch
    # 11 10 N/N             1  group     publishing
    txt = r''' (<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Mr. N_142/N_142>) (<L N NNP NNP Vinken N>) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/NP VBZ VBZ is (S[dcl]\NP_87)/NP_88>) (<T NP 0 2> (<T NP 0 1> (<L N NN NN chairman N>) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_99\NP_99)/NP_100>) (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Elsevier N_109/N_109>) (<L N NNP NNP N.V. N>) ) ) (<T NP[conj] 1 2> (<L , , , , ,>) (<T NP 1 2> (<L NP[nb]/N DT DT the NP[nb]_131/N_131>) (<T N 1 2> (<L N/N NNP NNP Dutch N_126/N_126>) (<T N 1 2> (<L N/N VBG VBG publishing N_119/N_119>) (<L N NN NN group N>) ) ) ) ) ) ) ) ) ) (<L . . . . .>) )'''
    pt = parse_ccg_derivation(txt)
    s = sentence_from_pt(pt)
    dprint(s)
    self.assertIsNotNone(pt)
    # Build the DRS without verbnet/wikipedia lookups so the test is hermetic.
    ccg = Ccg2Drs(CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    ccg.build_execution_sequence(pt)
    ccg.create_drs()
    ccg.resolve_proper_names()
    ccg.final_rename()
    d = ccg.get_drs()
    s = d.show(SHOW_LINEAR)
    dprint(s)
    sent = ccg.get_verbnet_sentence()
    a = get_constituents_string_list(sent)
    # '#' marks the head word of each constituent.
    x = [
        'NP(#Mr.-Vinken)',
        'VP(#is)',
        'NP(#chairman)',
        'PP(#of)',
        'NP(#Elsevier-N.V.)',
        'NP(the Dutch publishing #group)',
    ]
    dprint('\n'.join(a))
    self.assertListEqual(x, a)
    # Expected constituent tree; tuple entries index into the list above:
    # 01 VP(is)
    # 00 NP(Mr.-Vinken)
    # 02 NP(chairman)
    # 03 PP(of Elsevier N.V. the Dutch publishing group)
    # 04 NP(Elsevier N.V.)
    # 05 NP(the Dutch publishing group)
    x = (1, [(0, []), (2, [(3, [(4, [(5, [])])])])])
    a = sent.get_constituent_tree()
    dprint_constituent_tree(sent, a)
    self.assertEqual(repr(x), repr(a))
def test2_GOLD_Wsj0001_1(self):
    # ID=wsj_0001.1 PARSER=GOLD NUMPARSE=1
    # Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov 29.
    # (The full gold derivation is the raw literal below.)
    txt = r'''(<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 2> (<T NP 0 2> (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Pierre N_73/N_73>) (<L N NNP NNP Vinken N>) ) ) (<L , , , , ,>) ) (<T NP\NP 0 1> (<T S[adj]\NP 1 2> (<T NP 0 1> (<T N 1 2> (<L N/N CD CD 61 N_93/N_93>) (<L N NNS NNS years N>) ) ) (<L (S[adj]\NP)\NP JJ JJ old (S[adj]\NP_83)\NP_84>) ) ) ) (<L , , , , ,>) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[b]\NP) MD MD will (S[dcl]\NP_10)/(S[b]_11\NP_10:B)_11>) (<T S[b]\NP 0 2> (<T S[b]\NP 0 2> (<T (S[b]\NP)/PP 0 2> (<L ((S[b]\NP)/PP)/NP VB VB join ((S[b]\NP_20)/PP_21)/NP_22>) (<T NP 1 2> (<L NP[nb]/N DT DT the NP[nb]_29/N_29>) (<L N NN NN board N>) ) ) (<T PP 0 2> (<L PP/NP IN IN as PP/NP_34>) (<T NP 1 2> (<L NP[nb]/N DT DT a NP[nb]_48/N_48>) (<T N 1 2> (<L N/N JJ JJ nonexecutive N_43/N_43>) (<L N NN NN director N>) ) ) ) ) (<T (S\NP)\(S\NP) 0 2> (<L ((S\NP)\(S\NP))/N[num] NNP NNP Nov. ((S_61\NP_56)_61\(S_61\NP_56)_61)/N[num]_62>) (<L N[num] CD CD 29 N[num]>) ) ) ) ) (<L . . . . .>) )'''
    pt = parse_ccg_derivation(txt)
    self.assertIsNotNone(pt)
    s = sentence_from_pt(pt)
    dprint(s)
    # Build the DRS without verbnet/wikipedia lookups so the test is hermetic.
    ccg = Ccg2Drs(CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    ccg.build_execution_sequence(pt)
    ccg.create_drs()
    ccg.resolve_proper_names()
    ccg.final_rename()
    d = ccg.get_drs()
    s = d.show(SHOW_LINEAR)
    dprint(s)
    sent = ccg.get_verbnet_sentence()
    a = get_constituents_string_list(sent)
    # FIXME: VP(will #join) should be S_INF(will #join).
    # Issues occurs because I convert modal-verb combinator categories to modifiers. Must be fixed on functor
    # creation - Lexeme.get_production()
    # will: (S[dcl]\NP)/(S[b]/NP) -> (S\NP)/(S/NP)
    # '#' marks the head word of each constituent.
    x = [
        'NP(#Pierre-Vinken)',
        'ADJP(61 years #old)',
        'NP(61 #years)',
        'VP(#will join)',
        'NP(the #board)',
        'PP(#as)',
        'NP(a nonexecutive #director)',
        'NP(#Nov. 29)'
    ]
    dprint('\n'.join(a))
    self.assertListEqual(x, a)
    # Expected constituent tree; tuple entries index into the list above:
    # 03 VP(will join)
    # 00 NP(Pierre-Vinken)
    # 01 ADJP(61 years old)
    # 02 NP(61 years)
    # 04 NP(the board)
    # 05 PP(as)
    # 06 NP(a nonexecutive director)
    # 07 NP(Nov. 29)
    x = (3, [(0, [(1, [(2, [])])]), (4, []), (5, [(6, [])]), (7, [])])
    a = sent.get_constituent_tree()
    dprint_constituent_tree(sent, a)
    self.assertEqual(repr(x), repr(a))
def test2_GOLD_Wsj0002_1(self):
    # ID=wsj_0002.1 PARSER=GOLD NUMPARSE=1
    # Rudolph Agnew, 55 years old and former chairman of Consolidated Gold Fields PLC, was named a nonexecutive
    # director of this British industrial conglomerate.
    # (The full gold derivation is the raw literal below.)
    txt = r'''(<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 2> (<T NP 0 2> (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Rudolph N_72/N_72>) (<L N NNP NNP Agnew N>) ) ) (<L , , , , ,>) ) (<T NP\NP 0 1> (<T S[adj]\NP 0 2> (<T S[adj]\NP 1 2> (<T NP 0 1> (<T N 1 2> (<L N/N CD CD 55 N_92/N_92>) (<L N NNS NNS years N>) ) ) (<L (S[adj]\NP)\NP JJ JJ old (S[adj]\NP_82)\NP_83>) ) (<T S[adj]\NP[conj] 1 2> (<L conj CC CC and conj>) (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N JJ JJ former N_102/N_102>) (<L N NN NN chairman N>) ) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_111\NP_111)/NP_112>) (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Consolidated N_135/N_135>) (<T N 1 2> (<L N/N NNP NNP Gold N_128/N_128>) (<T N 1 2> (<L N/N NNP NNP Fields N_121/N_121>) (<L N NNP NNP PLC N>) ) ) ) ) ) ) ) ) ) ) (<L , , , , ,>) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[pss]\NP) VBD VBD was (S[dcl]\NP_10)/(S[pss]_11\NP_10:B)_11>) (<T S[pss]\NP 0 2> (<L (S[pss]\NP)/NP VBN VBN named (S[pss]\NP_18)/NP_19>) (<T NP 0 2> (<T NP 1 2> (<L NP[nb]/N DT DT a NP[nb]_33/N_33>) (<T N 1 2> (<L N/N JJ JJ nonexecutive N_28/N_28>) (<L N NN NN director N>) ) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_41\NP_41)/NP_42>) (<T NP 1 2> (<L NP[nb]/N DT DT this NP[nb]_63/N_63>) (<T N 1 2> (<L N/N JJ JJ British N_58/N_58>) (<T N 1 2> (<L N/N JJ JJ industrial N_51/N_51>) (<L N NN NN conglomerate N>) ) ) ) ) ) ) ) ) (<L . . . . .>) )'''
    pt = parse_ccg_derivation(txt)
    self.assertIsNotNone(pt)
    s = sentence_from_pt(pt)
    dprint(s)
    # Build the DRS without verbnet/wikipedia lookups so the test is hermetic.
    ccg = Ccg2Drs(CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    ccg.build_execution_sequence(pt)
    ccg.create_drs()
    ccg.resolve_proper_names()
    ccg.final_rename()
    d = ccg.get_drs()
    s = d.show(SHOW_LINEAR)
    dprint(s)
    sent = ccg.get_verbnet_sentence()
    a = get_constituents_string_list(sent)
    dprint('\n'.join(a))
    # Hash indicates head word in constituent
    x = [
        'NP(#Rudolph-Agnew)',
        'ADJP(55 years #old and former chairman of Consolidated-Gold-Fields-PLC)',
        'NP(55 #years)',
        'NP(former #chairman)',
        'PP(#of)',
        'NP(#Consolidated-Gold-Fields-PLC)',
        'VP(#was named)',
        'NP(a nonexecutive #director)',
        'PP(#of)',
        'NP(this British industrial #conglomerate)'
    ]
    self.assertListEqual(x, a)
    # Expected constituent tree; tuple entries index into the list above:
    # 6 VP(was named)
    # 0 NP(Rudolph-Agnew)
    # 1 ADVP(55 years old former chairman of Consolidated-Gold-Fields-PLC)
    # 2 NP(55 years)
    # 3 NP(former chairman)
    # 4 PP(of)
    # 5 NP(Consolidated-Gold-Fields-PLC)
    # 7 NP(a nonexecutive director)
    # 8 PP(of)
    # 9 NP(this British industrial conglomerate)
    x = (6, [(0, [(1, [(2, []), (3, [(4, [(5, [])])])])]), (7, [(8, [(9, [])])])])
    a = sent.get_constituent_tree()
    dprint_constituent_tree(sent, a)
    self.assertEqual(repr(x), repr(a))
def make_lexicon(daemon):
    """Extract a lexicon from the ccgbank derivations for `daemon`.

    Reads data/ldc/<daemon>/ccgbank/ccg_derivation*.txt, accumulates a
    stem -> usage dictionary via extract_lexicon_from_pt(), then writes the
    predicates grouped alphabetically under lexicon/az/ and grouped by
    category return-type atom under lexicon/rt/.
    """
    global pypath, projdir, datapath, idsrch
    allfiles = []
    projdir = os.path.dirname(os.path.dirname(__file__))
    easysrl_path = os.path.join(projdir, 'data', 'ldc', daemon, 'lexicon')
    if not os.path.exists(easysrl_path):
        os.makedirs(easysrl_path)
    if not os.path.exists(os.path.join(easysrl_path, 'rt')):
        os.makedirs(os.path.join(easysrl_path, 'rt'))
    if not os.path.exists(os.path.join(easysrl_path, 'az')):
        os.makedirs(os.path.join(easysrl_path, 'az'))

    # Collect the derivation input files.
    ldcpath = os.path.join(projdir, 'data', 'ldc', daemon, 'ccgbank')
    dirlist1 = sorted(os.listdir(ldcpath))
    #dirlist1 = ['ccg_derivation00.txt']
    for fname in dirlist1:
        if 'ccg_derivation' not in fname:
            continue
        ldcpath1 = os.path.join(ldcpath, fname)
        if os.path.isfile(ldcpath1):
            allfiles.append(ldcpath1)

    failed_parse = 0
    failed_ccg_derivation = []
    start = 0
    progress = -1
    dictionary = None
    for fn in allfiles:
        idx = idsrch.match(fn)
        if idx is None:
            continue
        idx = idx.group('id')
        with open(fn, 'r') as fd:
            lines = fd.readlines()
        name, _ = os.path.splitext(os.path.basename(fn))
        for i in range(start, len(lines)):
            start = 0
            ccgbank = lines[i].strip()
            if len(ccgbank) == 0 or ccgbank[0] == '#':
                continue
            if progress < 0:
                print('%s-%04d' % (name, i))
            else:
                progress = print_progress(progress, 10)
            try:
                # CCG parser is Java so output is UTF-8.
                ccgbank = safe_utf8_decode(ccgbank)
                pt = parse_ccg_derivation(ccgbank)
                s = sentence_from_pt(pt).strip()
            except Exception:
                # BUG FIX: a leftover debug `raise` sat before this `continue`,
                # making it unreachable and aborting the batch on the first
                # bad derivation. Count the failure and keep going.
                failed_parse += 1
                continue
            uid = '%s-%04d' % (idx, i)
            try:
                #dictionary[0-25][stem][set([c]), set(uid)]
                dictionary = extract_lexicon_from_pt(pt, dictionary, uid=uid)
            except Exception as e:
                # BUG FIX: same dead `raise`-before-`continue` removed.
                print(e)
                continue

    if dictionary is None:
        # Nothing was parsed (no input files or all lines skipped/failed);
        # previously this fell through to len(None) and raised TypeError.
        return

    # Write the alphabetic (az) lexicon and build the return-type index.
    rtdict = {}
    for idx in range(len(dictionary)):
        # NOTE(review): 0x40 maps index 0 -> '@', 1 -> 'A'; confirm the
        # intended base (the comment above says indices are 0-25).
        fname = unichr(idx + 0x40)
        filepath = os.path.join(easysrl_path, 'az', fname + '.txt')
        with open(filepath, 'w') as fd:
            d = dictionary[idx]
            for k, v in d.iteritems():
                # k == stem, v = {c: set(uid)}
                fd.write(b'<predicate name=\'%s\'>\n' % safe_utf8_encode(k))
                for x, w in v.iteritems():
                    fd.write(b'<usage \'%s\'>\n' % safe_utf8_encode(x))
                    nc = x.split(':')
                    if len(nc) == 2:
                        c = Category.from_cache(
                            Category(nc[1].strip()).clean(True))
                        # Return type atom
                        rt = c.extract_unify_atoms(False)[-1]
                        if rt in rtdict:
                            cdict = rtdict[rt]
                            if c in cdict:
                                cdict[c].append(nc[0])
                            else:
                                cdict[c] = [nc[0]]
                        else:
                            rtdict[rt] = {c: [nc[0]]}
                    for y in w:
                        fd.write(b'sentence id: ' + safe_utf8_encode(y))
                        fd.write(b'\n')
                    fd.write(b'</usage>\n')
                fd.write(b'</predicate>\n\n')
        # Free up memory
        dictionary[idx] = None
        d = None

    # Write the return-type (rt) grouping.
    for rt, cdict in rtdict.iteritems():
        fname = rt.signature.replace('[', '_').replace(']', '')
        filepath = os.path.join(easysrl_path, 'rt', fname + '.txt')
        with open(filepath, 'w') as fd:
            for c, vs in cdict.iteritems():
                fd.write(b'<category signature=\'%s\'>\n' % safe_utf8_encode(c))
                for v in vs:
                    fd.write(v)
                    fd.write(b'\n')
                fd.write(b'</category>\n\n')