def test1_JsonFiles(self):
    # Round-trip one specific cached news article (selected by its hash file
    # name) through the CCG parse service and assemble a JSON result body.
    # NOTE(review): nothing is asserted; this test only checks for exceptions.
    filelist = os.listdir(datapath)
    allfiles = []
    for fn in filelist:
        if not os.path.isfile(os.path.join(datapath, fn)):
            continue
        f, x = os.path.splitext(fn)
        # Only the single article with this md5-style name is processed.
        if x == '.json' and f == '9255a890ffe40c05876d8d402044ab11':
            allfiles.append(os.path.join(datapath, fn))
    for fn in allfiles:
        with open(fn, 'r') as fd:
            body = json.load(fd, encoding='utf-8')
        # Parse the article title first.
        smod = preprocess_sentence(body['title'])
        ccgbank = grpc.ccg_parse(self.stub, smod, grpc.DEFAULT_SESSION)
        pt = parse_ccg_derivation(ccgbank)
        ccg = process_ccg_pt(pt)
        ccgbody = {}
        ccgbody['story'] = {
            'title': [x.get_json() for x in ccg.get_span()],
            'paragraphs': []
        }
        # Split content into non-empty, stripped paragraphs (Py2 filter/map
        # return lists, so slicing below is valid).
        paragraphs = filter(
            lambda y: len(y) != 0,
            map(lambda x: x.strip(), body['content'].split('\n')))
        i = 0
        for p in paragraphs[i:]:
            # NLTK sentence splitter; drop blank sentences.
            sentences = filter(lambda x: len(x.strip()) != 0, sent_tokenize(p))
            sp = []
            j = 0
            for s in sentences[j:]:
                dprint('p:s = %d:%d' % (i, j))
                smod = preprocess_sentence(s)
                ccgbank = grpc.ccg_parse(self.stub, smod, grpc.DEFAULT_SESSION)
                pt = parse_ccg_derivation(ccgbank)
                ccg = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
                sp.append([x.get_json() for x in ccg.get_span()])
                j += 1
            ccgbody['story']['paragraphs'].append(sp)
            i += 1
        # msgbody is built but unused — presumably kept for debugger inspection.
        msgbody = json.dumps(ccgbody)
        pass
def test10_OrOfVerb_OrInBrackets(self):
    # Disjunction of passive verbs ("perceived or known or inferred") plus a
    # parenthesized disjunction ("(living or nonliving)").
    text = "That which is perceived or known or inferred to have its own distinct existence (living or nonliving)"
    mtext = preprocess_sentence(text)
    derivation = grpc.ccg_parse(self.stub, mtext, grpc.DEFAULT_SESSION)
    pt = parse_ccg_derivation(derivation)
    sentence = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    d = sentence.get_drs(nodups=True)
    dprint(pt_to_ccg_derivation(pt))
    dprint(d)
    # RT_EMPTY_DRS adds 'or' to phrases
    f = sentence.select_phrases(lambda x: x.pos is POS.from_cache('WDT') or \
        0 == (x.mask & RT_EMPTY_DRS), contiguous=False)
    phrases = [sp.text for r, sp in f.iteritems()]
    self.assertTrue('That which' in phrases)
    self.assertTrue('have' in phrases)
    self.assertTrue('is perceived known inferred' in phrases)
    self.assertTrue('its own distinct existence' in phrases)
    # First (referent, span) pair for each phrase of interest.
    verb1 = filter(lambda x: 'is perceived known inferred' == x[1].text, f.iteritems())[0]
    verb2 = filter(lambda x: 'have' == x[1].text, f.iteritems())[0]
    agent = filter(lambda x: 'That which' == x[1].text, f.iteritems())[0]
    theme = filter(lambda x: 'its own distinct existence' == x[1].text, f.iteritems())[0]
    X1 = agent[0]
    E1 = verb1[0]
    E2 = verb2[0]
    # NOTE(review): pulls referents from individual lexemes of the theme span;
    # assumes refs[1] of lexeme 0 is the possessor and refs[0] of lexeme 1 the
    # possessed — confirm against the Span/Lexeme API.
    X2 = theme[1][0].refs[1]
    X3 = theme[1][1].refs[0]
    self.assertTrue(d.find_condition(Rel('_EVENT', [E1])) is not None)
    self.assertTrue(d.find_condition(Rel('_ARG0', [E1, X1])) is not None)
    self.assertTrue(d.find_condition(Rel('_ARG1', [E1, E2])) is not None)
    # TODO: should the theme attach to X2?
    self.assertTrue(d.find_condition(Rel('_ARG1', [E2, X3])) is not None)
    self.assertTrue(d.find_condition(Rel('_POSS', [X2, X3])) is not None)
def test01_AndOfSubj(self):
    """Conjoined proper-noun subjects each become an ARG0 of the event."""
    raw = "John and Paul went to the movies"
    cooked = preprocess_sentence(raw)
    ccgbank = grpc.ccg_parse(self.stub, cooked, grpc.DEFAULT_SESSION)
    ptree = parse_ccg_derivation(ccgbank)
    sent = process_ccg_pt(ptree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    drs = sent.get_drs()
    dprint(pt_to_ccg_derivation(ptree))
    dprint(drs)
    found = sent.select_phrases(RT_PROPERNAME | RT_EVENT)
    texts = [span.text for _, span in found.iteritems()]
    for want in ['John', 'Paul', 'went']:
        self.assertTrue(want in texts)
    def first_with(txt):
        # First (referent, span) pair whose phrase text matches exactly.
        return [kv for kv in found.iteritems() if kv[1].text == txt][0]
    J = first_with('John')[0]
    P = first_with('Paul')[0]
    E = first_with('went')[0]
    self.assertTrue(drs.find_condition(Rel('_EVENT', [E])) is not None)
    self.assertTrue(drs.find_condition(Rel('go', [E])) is not None)
    self.assertTrue(drs.find_condition(Rel('John', [J])) is not None)
    self.assertTrue(drs.find_condition(Rel('Paul', [P])) is not None)
    self.assertTrue(drs.find_condition(Rel('_ARG0', [E, J])) is not None)
def test1_PP_Attachment(self):
    """Check constituent structure for a PP-attachment case.

    NCCG gets the 'with meatballs' attachment wrong; verify our result.
    """
    txt = "Eat spaghetti with meatballs"
    ccgbank = grpc.ccg_parse(self.stub, txt, grpc.DEFAULT_SESSION)
    ptree = parse_ccg_derivation(ccgbank)
    self.assertIsNotNone(ptree)
    dprint(sentence_from_pt(ptree))
    sent = process_ccg_pt(ptree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    drs = sent.get_drs()
    dprint(drs.show(SHOW_LINEAR))
    strings = get_constituents_string_list(sent)
    dprint('\n'.join(strings))
    expected = [
        'S_INF(#Eat spaghetti with meatballs)',  # 0
        'NP(#spaghetti)',                        # 1
        'NP(#meatballs)',                        # 2
    ]
    self.assertListEqual(expected, strings)
    expected_tree = (0, [(1, []), (2, [])])
    tree = sent.get_constituent_tree()
    dprint_constituent_tree(sent, tree)
    self.assertEqual(repr(expected_tree), repr(tree))
    vsent = get_constituent_string(sent.get_verbnet_sentence())
    self.assertEqual('S_INF(#Eat with) NP(#spaghetti) NP(#meatballs)', vsent)
def test03_OrOfObj(self):
    """Disjunction of objects: 'games or sport' — last disjunct gets ARG1."""
    raw = "To participate in games or sport"
    cooked = preprocess_sentence(raw)
    ccgbank = grpc.ccg_parse(self.stub, cooked, grpc.DEFAULT_SESSION)
    ptree = parse_ccg_derivation(ccgbank)
    sent = process_ccg_pt(ptree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    drs = sent.get_drs()
    dprint(pt_to_ccg_derivation(ptree))
    dprint(drs)
    found = sent.select_phrases(RT_ENTITY | RT_EVENT)
    texts = [span.text for _, span in found.iteritems()]
    for want in ['participate', 'games', 'sport']:
        self.assertTrue(want in texts)
    def first_with(txt):
        # First (referent, span) pair whose phrase text matches exactly.
        return [kv for kv in found.iteritems() if kv[1].text == txt][0]
    X1 = first_with('games')[0]
    X2 = first_with('sport')[0]
    E = first_with('participate')[0]
    self.assertTrue(drs.find_condition(Rel('_EVENT', [E])) is not None)
    self.assertTrue(drs.find_condition(Rel('participate', [E])) is not None)
    self.assertTrue(drs.find_condition(Rel('games', [X1])) is not None)
    self.assertTrue(drs.find_condition(Rel('sport', [X2])) is not None)
    self.assertTrue(drs.find_condition(Rel('_ARG1', [E, X2])) is not None)
def test1_EasySRL_BoyGirl2(self):
    # Hand-written EasySRL derivation for "The boy will want to believe the
    # girl"; exercises modal + control-verb chaining without the gRPC service.
    txt = r'''(<T S[dcl] 1 2> (<T NP 0 2> (<L NP/N DT DT The NP/N>) (<L N NN NN boy N>) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[b]\NP) MD MD will (S[dcl]\NP)/(S[b]\NP)>) (<T S[b]\NP 0 2> (<L (S[b]\NP)/(S[to]\NP) VB VB want (S[b]\NP)/(S[to]\NP)>) (<T S[to]\NP 0 2> (<L (S[to]\NP)/(S[b]\NP) TO TO to (S[to]\NP)/(S[b]\NP)>) (<T S[b]\NP 0 2> (<L (S[b]\NP)/NP VB VB believe (S[b]\NP)/NP>) (<T NP 0 2> (<L NP/N DT DT the NP/N>) (<L N NN NN girl N>) ) ) ) ) ) )'''
    pt = parse_ccg_derivation(txt)
    self.assertIsNotNone(pt)
    s = sentence_from_pt(pt)
    dprint(s)
    # Build the DRS step by step instead of via process_ccg_pt().
    ccg = Ccg2Drs(CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    ccg.build_execution_sequence(pt)
    ccg.create_drs()
    ccg.resolve_proper_names()
    ccg.final_rename()
    d = ccg.get_drs()
    s = d.show(SHOW_LINEAR)
    dprint(s)
    # Exact linear DRS: shared ARG0 (the boy) across want/believe.
    x = '[X1,E2,E3,X4| boy(X1),will(E2),_MODAL(E2),want(E2),_EVENT(E2),_ARG0(E2,X1),_ARG1(E2,E3),believe(E3),_EVENT(E3),_ARG0(E3,X1),_ARG1(E3,X4),girl(X4)]'
    self.assertEqual(x, s)
    a = get_constituents_string_list(ccg)
    dprint('\n'.join(a))
    x = [
        'S(The boy #will want to believe the girl)',
        'NP(#The boy)',
        'S_INF(#want to believe the girl)',
        'S_INF(#to believe the girl)',
        'S_INF(#believe the girl)',
        'NP(#the girl)'
    ]
    self.assertListEqual(x, a)
    s = get_constituent_string(ccg.get_verbnet_sentence())
    self.assertEqual('NP(#The boy) VP(#will want) S_INF(#to believe) NP(#the girl)', s)
def test05_AndOfVerb_AndOfObj(self):
    # Conjoined verbs plus a three-way conjoined object list.
    text = "Bell makes and distributes computers, electronics, and building products"
    mtext = preprocess_sentence(text)
    derivation = grpc.ccg_parse(self.stub, mtext, grpc.DEFAULT_SESSION)
    pt = parse_ccg_derivation(derivation)
    sentence = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    d = sentence.get_drs()
    dprint(pt_to_ccg_derivation(pt))
    dprint(d)
    f = sentence.select_phrases(RT_PROPERNAME | RT_ENTITY | RT_EVENT | RT_ATTRIBUTE)
    phrases = [sp.text for r, sp in f.iteritems()]
    self.assertTrue('Bell' in phrases)
    self.assertTrue('makes distributes' in phrases)
    self.assertTrue('computers' in phrases)
    self.assertTrue('electronics' in phrases)
    # Note if we add RT_EMPTY_DRS to the selection criteria then this phrase becomes 'and building products'
    self.assertTrue('building products' in phrases)
    self.assertEqual(5, len(phrases))
    # First (referent, span) pair per phrase of interest.
    verb1 = filter(lambda x: 'makes distributes' == x[1].text, f.iteritems())[0]
    agent = filter(lambda x: 'Bell' == x[1].text, f.iteritems())[0]
    theme1 = filter(lambda x: 'computers' == x[1].text, f.iteritems())[0]
    theme2 = filter(lambda x: 'electronics' == x[1].text, f.iteritems())[0]
    theme3 = filter(lambda x: 'building products' == x[1].text, f.iteritems())[0]
    X1 = agent[0]
    # NOTE(review): Y1 and Y2 are never asserted against — see the TODO below.
    Y1 = theme1[0]
    Y2 = theme2[0]
    Y3 = theme3[0]
    E1 = verb1[0]
    self.assertTrue(d.find_condition(Rel('_EVENT', [E1])) is not None)
    self.assertTrue(d.find_condition(Rel('_ARG0', [E1, X1])) is not None)
    # TODO: should we add proposition for multi NP's conjoined?
    self.assertTrue(d.find_condition(Rel('_ARG1', [E1, Y3])) is not None)
def test02_AndOfObj(self):
    """Conjoined proper-noun objects: 'He saw John and Paul'."""
    raw = "He saw John and Paul"
    cooked = preprocess_sentence(raw)
    ccgbank = grpc.ccg_parse(self.stub, cooked, grpc.DEFAULT_SESSION)
    ptree = parse_ccg_derivation(ccgbank)
    sent = process_ccg_pt(ptree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    drs = sent.get_drs()
    dprint(pt_to_ccg_derivation(ptree))
    dprint(drs)
    found = sent.select_phrases(RT_PROPERNAME | RT_EVENT)
    texts = [span.text for _, span in found.iteritems()]
    for want in ['John', 'Paul', 'saw']:
        self.assertTrue(want in texts)
    def first_with(txt):
        # First (referent, span) pair whose phrase text matches exactly.
        return [kv for kv in found.iteritems() if kv[1].text == txt][0]
    J = first_with('John')[0]
    P = first_with('Paul')[0]
    E = first_with('saw')[0]
    # FIXME: wn lemmatizer does not convert 'saw' to 'see' - likely due to ambiguity
    self.assertTrue(drs.find_condition(Rel('_EVENT', [E])) is not None)
    self.assertTrue(drs.find_condition(Rel('saw', [E])) is not None)
    self.assertTrue(drs.find_condition(Rel('John', [J])) is not None)
    self.assertTrue(drs.find_condition(Rel('Paul', [P])) is not None)
    self.assertTrue(drs.find_condition(Rel('_ARG1', [E, J])) is not None)
def test8_Wsj0004_3(self):
    # Gold LDC derivation for wsj_0004.3. Checks that the conj + S[em] ->
    # NP[conj] combination selects the atomic type-change rule, and that the
    # execution queue built from the parse tree matches exactly.
    txt = r''' (<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 1> (<T N 1 2> (<L N/N NN NN Compound N_309/N_309>) (<L N NNS NNS yields N>) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/NP VBP VBP assume (S[dcl]\NP_236)/NP_237>) (<T NP 0 2> (<T NP 0 2> (<T NP 0 1> (<L N NN NN reinvestment N>) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_248\NP_248)/NP_249>) (<T NP 0 1> (<L N NNS NNS dividends N>) ) ) ) (<T NP[conj] 1 2> (<L conj CC CC and conj>) (<T S[em] 0 2> (<L S[em]/S[dcl] IN IN that S[em]/S[dcl]_257>) (<T S[dcl] 1 2> (<T NP 1 2> (<L NP[nb]/N DT DT the NP[nb]_297/N_297>) (<T N 1 2> (<L N/N JJ JJ current N_292/N_292>) (<L N NN NN yield N>) ) ) (<T S[dcl]\NP 0 2> (<L S[dcl]\NP VBZ VBZ continues S[dcl]\NP_262>) (<T (S\NP)\(S\NP) 0 2> (<L ((S\NP)\(S\NP))/NP IN IN for ((S_275\NP_270)_275\(S_275\NP_270)_275)/NP_276>) (<T NP 1 2> (<L NP[nb]/N DT DT a NP[nb]_283/N_283>) (<L N NN NN year N>) ) ) ) ) ) ) ) ) ) (<L . . . . .>) ) '''
    pt = parse_ccg_derivation(txt)
    ccg = Ccg2Drs()
    rule = get_rule(Category.from_cache('conj'), Category.from_cache('S[em]'), Category.from_cache('NP[conj]'))
    self.assertEqual(rule, RL_TC_ATOM)
    ccg.build_execution_sequence(pt)
    # Check execution queue
    actual = [repr(x) for x in ccg.exeque]
    expected = [
        '<PushOp>:(compound, N/N, NN)',
        '<PushOp>:(yields, N, NNS)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(1, LP NP)',
        '<PushOp>:(assume, (S[dcl]\\NP)/NP, VBP)',
        '<PushOp>:(reinvestment, N, NN)',
        '<ExecOp>:(1, LP NP)',
        '<PushOp>:(of, (NP\\NP)/NP, IN)',
        '<PushOp>:(dividends, N, NNS)',
        '<ExecOp>:(1, LP NP)',
        '<ExecOp>:(2, FA NP\\NP)',
        '<ExecOp>:(2, BA NP)',
        '<PushOp>:(and, conj, CC)',
        '<PushOp>:(that, S[em]/S[dcl], IN)',
        '<PushOp>:(the, NP[nb]/N, DT)',
        '<PushOp>:(current, N/N, JJ)',
        '<PushOp>:(yield, N, NN)',
        '<ExecOp>:(2, FA N)',
        '<ExecOp>:(2, FA NP)',
        '<PushOp>:(continue, S[dcl]\\NP, VBZ)',
        '<PushOp>:(for, ((S\\NP)\\(S\\NP))/NP, IN)',
        '<PushOp>:(a, NP[nb]/N, DT)',
        '<PushOp>:(year, N, NN)',
        '<ExecOp>:(2, FA NP)',
        '<ExecOp>:(2, FA (S\\NP)\\(S\\NP))',
        '<ExecOp>:(2, BA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<ExecOp>:(2, FA S[em])',
        '<ExecOp>:(2, ATOM_TC NP[conj])',
        '<ExecOp>:(2, RCONJ NP)',
        '<ExecOp>:(2, FA S[dcl]\\NP)',
        '<ExecOp>:(2, BA S[dcl])',
        '<PushOp>:(., ., .)',
        '<ExecOp>:(2, LP S[dcl])',
    ]
    self.assertListEqual(expected, actual)
def test1_Currency_00_0194(self):
    # Currency amounts ('$ 19.3 million') must tokenize and attach correctly.
    text = r"Without the Cray-3 research and development expenses, the company would have been able to report a profit of $19.3 million for the first half of 1989 rather than the $5.9 million it posted."
    etext = r"Without the Cray-3 research and development expenses , the company would have been able to report a profit of $ 19.3 million for the first half of 1989 rather than the $ 5.9 million it posted"
    mtext = preprocess_sentence(text)
    self.assertEqual(etext, mtext)
    derivation = grpc.ccg_parse(self.stub, mtext, grpc.DEFAULT_SESSION)
    pt = parse_ccg_derivation(derivation)
    sentence = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    d = sentence.get_drs(nodups=True)
    dprint(pt_to_ccg_derivation(pt))
    dprint(d)
    fnps = sentence.get_np_nominals()
    nps = [sp.text for r, sp in fnps]
    self.assertTrue('the Cray-3 research and development expenses' in nps)
    self.assertTrue('the company' in nps)
    self.assertTrue('a profit' in nps)
    self.assertTrue('$ 19.3 million' in nps)
    self.assertTrue('the first half' in nps)
    self.assertTrue('the $ 5.9 million' in nps)
    self.assertTrue('1989' in nps)
    fvps = sentence.get_vp_nominals()
    vps = [sp.text for r, sp in fvps]
    self.assertTrue('would have been' in vps)
    self.assertTrue('report' in vps)
    self.assertTrue('posted' in vps)
    # Map each phrase to its DRS referent (first matching span, first referent).
    would_have_been = filter(lambda x: 'would have been' == x[1].text, fvps)[0][0]
    report = filter(lambda x: 'report' == x[1].text, fvps)[0][0]
    posted = filter(lambda x: 'posted' == x[1].text, fvps)[0][0]
    cray_rnd = filter(
        lambda x: 'the Cray-3 research and development expenses' == x[1].text,
        fnps)[0][0]
    company = filter(lambda x: 'the company' == x[1].text, fnps)[0][0]
    profit = filter(lambda x: 'a profit' == x[1].text, fnps)[0][0]
    first_half = filter(lambda x: 'the first half' == x[1].text, fnps)[0][0]
    n1989 = filter(lambda x: '1989' == x[1].text, fnps)[0][0]
    n19_3M = filter(lambda x: '$ 19.3 million' == x[1].text, fnps)[0][0]
    n5_9M = filter(lambda x: 'the $ 5.9 million' == x[1].text, fnps)[0][0]
    self.assertTrue(
        d.find_condition(Rel('without', [would_have_been, cray_rnd])) is not None)
    self.assertTrue(
        d.find_condition(Rel('_ARG0', [would_have_been, company])) is not None)
    self.assertTrue(
        d.find_condition(Rel('_ARG0', [report, company])) is not None)
    self.assertTrue(
        d.find_condition(Rel('_ARG1', [report, profit])) is not None)
    self.assertTrue(
        d.find_condition(Rel('of', [profit, n19_3M])) is not None)
    self.assertTrue(
        d.find_condition(Rel('for', [profit, first_half])) is not None)
    self.assertTrue(
        d.find_condition(Rel('of', [first_half, n1989])) is not None)
    self.assertTrue(
        d.find_condition(Rel('_ARG1', [posted, n5_9M])) is not None)
def test2_Wsj_0056_1(self):
    """A lone '@' token parses to a single-constituent sentence."""
    # RAW 1043
    txt = '''@'''
    ccgbank = grpc.ccg_parse(self.stub, txt, grpc.DEFAULT_SESSION)
    ptree = parse_ccg_derivation(ccgbank)
    self.assertIsNotNone(ptree)
    dprint(sentence_from_pt(ptree))
    sent = process_ccg_pt(ptree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    drs = sent.get_drs()
    dprint(drs.show(SHOW_LINEAR))
    constituents = get_constituents_string_list(sent)
    dprint('\n'.join(constituents))
    self.assertListEqual(['S(#@)'], constituents)
def test2_Date_21_0985(self):
    """Dates and hyphenated named entities survive preprocessing and parsing."""
    raw = r"Annualized interest rates on certain investments as reported by the Federal Reserve Board on a weekly-average basis: 1989 and Wednesday October 4, 1989."
    expected = r"Annualized interest rates on certain investments as reported by the Federal Reserve Board on a weekly-average basis : 1989 and Wednesday October 4 , 1989"
    cooked = preprocess_sentence(raw)
    self.assertEqual(expected, cooked)
    ccgbank = grpc.ccg_parse(self.stub, cooked, grpc.DEFAULT_SESSION)
    ptree = parse_ccg_derivation(ccgbank)
    sent = process_ccg_pt(ptree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    drs = sent.get_drs()
    dprint(pt_to_ccg_derivation(ptree))
    dprint(drs)
    texts = [span.text for _, span in sent.get_np_nominals()]
    for want in ['Annualized interest rates', 'certain investments',
                 'the Federal-Reserve-Board', 'a weekly-average basis',
                 'Wednesday October 4']:
        self.assertTrue(want in texts)
def test2_Date_00_1228(self):
    """Abbreviated month dates ('Jan. 2', 'Dec. 15') pass through preprocessing unchanged."""
    raw = r"The reduced dividend is payable Jan. 2 to stock of record Dec. 15"
    expected = r"The reduced dividend is payable Jan. 2 to stock of record Dec. 15"
    cooked = preprocess_sentence(raw)
    self.assertEqual(expected, cooked)
    ccgbank = grpc.ccg_parse(self.stub, cooked, grpc.DEFAULT_SESSION)
    ptree = parse_ccg_derivation(ccgbank)
    sent = process_ccg_pt(ptree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    drs = sent.get_drs()
    dprint(pt_to_ccg_derivation(ptree))
    dprint(drs)
    texts = [span.text for _, span in sent.get_np_nominals()]
    for want in ['The reduced dividend', 'payable', 'Jan. 2',
                 'Dec. 15', 'stock', 'record']:
        self.assertTrue(want in texts)
def test3_ApposInterrupt(self):
    """Interrupting apposition binds 'Robbie' to the descriptive NP via _AKA."""
    raw = r"Robbie, a hot-tempered tennis player, charged the umpire and tried to crack the poor man's skull with a racket."
    cooked = preprocess_sentence(raw)
    ccgbank = grpc.ccg_parse(self.stub, cooked, grpc.DEFAULT_SESSION)
    ptree = parse_ccg_derivation(ccgbank)
    sent = process_ccg_pt(ptree, CO_NO_VERBNET|CO_NO_WIKI_SEARCH)
    drs = sent.get_drs()
    dprint(pt_to_ccg_derivation(ptree))
    dprint(drs)
    nominals = sent.get_np_nominals()
    texts = [span.text for _, span in nominals]
    self.assertTrue('Robbie' in texts)
    self.assertTrue('a hot-tempered tennis player' in texts)
    X = [kv for kv in nominals if kv[1].text == 'Robbie'][0][0]
    Y = [kv for kv in nominals if kv[1].text == 'a hot-tempered tennis player'][0][0]
    self.assertNotEqual(X, Y)
    self.assertTrue(drs.find_condition(Rel('_AKA', [X, Y])) is not None)
    # Exactly one _AKA relation should appear in the whole DRS.
    self.assertTrue(len(repr(drs).split('_AKA')) == 2)
def test4_ApposInterrupt(self):
    """Apposition plus relative-clause interruption still yields one _AKA link."""
    raw = r"Bell, a telecommunications company, which is located in Los Angeles, makes and distributes electronics, computers, and building products"
    cooked = preprocess_sentence(raw)
    ccgbank = grpc.ccg_parse(self.stub, cooked, grpc.DEFAULT_SESSION)
    ptree = parse_ccg_derivation(ccgbank)
    sent = process_ccg_pt(ptree, CO_NO_VERBNET|CO_NO_WIKI_SEARCH)
    drs = sent.get_drs()
    dprint(pt_to_ccg_derivation(ptree))
    dprint(drs)
    nominals = sent.get_np_nominals()
    texts = [span.text for _, span in nominals]
    self.assertTrue('Bell' in texts)
    self.assertTrue('a telecommunications company' in texts)
    X = [kv for kv in nominals if kv[1].text == 'Bell'][0][0]
    Y = [kv for kv in nominals if kv[1].text == 'a telecommunications company'][0][0]
    self.assertNotEqual(X, Y)
    self.assertTrue(drs.find_condition(Rel('_AKA', [X, Y])) is not None)
    # Exactly one _AKA relation should appear in the whole DRS.
    self.assertTrue(len(repr(drs).split('_AKA')) == 2)
def test1_Currency_00_0195(self):
    """Currency tokenization ('$ 20.5 million') and NP/VP nominal extraction."""
    raw = r"On the other hand, had it existed then, Cray Computer would have incurred a $20.5 million loss."
    expected = r"On the other hand , had it existed then , Cray Computer would have incurred a $ 20.5 million loss ."
    cooked = preprocess_sentence(raw)
    self.assertEqual(expected, cooked)
    ccgbank = grpc.ccg_parse(self.stub, cooked, grpc.DEFAULT_SESSION)
    ptree = parse_ccg_derivation(ccgbank)
    sent = process_ccg_pt(ptree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    drs = sent.get_drs()
    dprint(pt_to_ccg_derivation(ptree))
    dprint(drs)
    np_texts = [span.text for _, span in sent.get_np_nominals()]
    for want in ['the other hand', 'Cray-Computer', '$ 20.5 million']:
        self.assertTrue(want in np_texts)
    vp_texts = [span.text for _, span in sent.get_vp_nominals()]
    for want in ['had', 'existed', 'would have incurred']:
        self.assertTrue(want in vp_texts)
def test2_ApposInterrupt(self):
    """Possessive inside an interrupting apposition: Reliable _AKA the beagle NP."""
    raw = r"Reliable, Diane's eleven-year-old beagle, chews holes in the living room carpeting as if he were still a puppy."
    cooked = preprocess_sentence(raw)
    ccgbank = grpc.ccg_parse(self.stub, cooked, grpc.DEFAULT_SESSION)
    ptree = parse_ccg_derivation(ccgbank)
    sent = process_ccg_pt(ptree, CO_NO_VERBNET|CO_NO_WIKI_SEARCH)
    drs = sent.get_drs()
    dprint(pt_to_ccg_derivation(ptree))
    dprint(drs)
    nominals = sent.get_np_nominals()
    texts = [span.text for _, span in nominals]
    self.assertTrue('Reliable' in texts)
    self.assertTrue("eleven-year-old beagle" in texts)
    self.assertTrue("Diane" in texts)
    X = [kv for kv in nominals if kv[1].text == 'Reliable'][0][0]
    Y = [kv for kv in nominals if kv[1].text == "eleven-year-old beagle"][0][0]
    self.assertNotEqual(X, Y)
    self.assertTrue(drs.find_condition(Rel('_AKA', [X, Y])) is not None)
    # Exactly one _AKA relation should appear in the whole DRS.
    self.assertTrue(len(repr(drs).split('_AKA')) == 2)
def test10_Brutus(self):
    """Passive voice: agent/patient roles for 'Ceasar was stabbed by Brutus'."""
    raw = "Ceasar was stabbed by Brutus"
    ccgbank = grpc.ccg_parse(self.stub, raw, grpc.DEFAULT_SESSION)
    ptree = parse_ccg_derivation(ccgbank)
    sent = process_ccg_pt(ptree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    drs = sent.get_drs()
    dprint(pt_to_ccg_derivation(ptree))
    dprint(drs)
    fnps = sent.get_np_nominals()
    np_texts = [span.text for _, span in fnps]
    self.assertTrue('Brutus' in np_texts)
    self.assertTrue('Ceasar' in np_texts)
    fvps = sent.get_vp_nominals()
    vp_texts = [span.text for _, span in fvps]
    self.assertTrue('was stabbed' in vp_texts)
    E = [kv for kv in fvps if kv[1].text == "was stabbed"][0][0]
    A1 = [kv for kv in fnps if kv[1].text == "Brutus"][0][0]
    A0 = [kv for kv in fnps if kv[1].text == "Ceasar"][0][0]
    # Passive: surface subject is ARG0, by-phrase is ARG1.
    self.assertTrue(drs.find_condition(Rel('_ARG0', [E, A0])) is not None)
    self.assertTrue(drs.find_condition(Rel('_ARG1', [E, A1])) is not None)
def test10_Ccgbank_00_0036(self):
    """NP/VP nominal extraction on a full WSJ sentence with possessives."""
    raw = "Average maturity of the funds' investments lengthened by a day to 41 days, the longest since early August, according to Donoghue's."
    expected = "Average maturity of the funds ' investments lengthened by a day to 41 days , the longest since early August , according to Donoghue 's ."
    cooked = preprocess_sentence(raw)
    self.assertEqual(expected, cooked)
    ccgbank = grpc.ccg_parse(self.stub, cooked, grpc.DEFAULT_SESSION)
    ptree = parse_ccg_derivation(ccgbank)
    sent = process_ccg_pt(ptree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    drs = sent.get_drs()
    dprint(pt_to_ccg_derivation(ptree))
    dprint(drs)
    np_texts = [span.text for _, span in sent.get_np_nominals()]
    # The 'Average maturity' check is currently disabled.
    for want in ['the funds', 'a day', '41 days', 'the longest', 'early August']:
        self.assertTrue(want in np_texts)
    vp_texts = [span.text for _, span in sent.get_vp_nominals()]
    for want in ['lengthened', 'according']:
        self.assertTrue(want in vp_texts)
def test04_AndOfVerb(self):
    """Conjoined verbs share agent and theme: 'Bell makes and distributes computers'."""
    raw = "Bell makes and distributes computers"
    cooked = preprocess_sentence(raw)
    ccgbank = grpc.ccg_parse(self.stub, cooked, grpc.DEFAULT_SESSION)
    ptree = parse_ccg_derivation(ccgbank)
    sent = process_ccg_pt(ptree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    drs = sent.get_drs()
    dprint(pt_to_ccg_derivation(ptree))
    dprint(drs)
    found = sent.select_phrases(RT_PROPERNAME | RT_ENTITY | RT_EVENT)
    texts = [span.text for _, span in found.iteritems()]
    for want in ['Bell', 'makes distributes', 'computers']:
        self.assertTrue(want in texts)
    def first_with(txt):
        # First (referent, span) pair whose phrase text matches exactly.
        return [kv for kv in found.iteritems() if kv[1].text == txt][0]
    E1 = first_with('makes distributes')[0]
    X1 = first_with('Bell')[0]
    X2 = first_with('computers')[0]
    self.assertTrue(drs.find_condition(Rel('_EVENT', [E1])) is not None)
    self.assertTrue(drs.find_condition(Rel('_ARG0', [E1, X1])) is not None)
    self.assertTrue(drs.find_condition(Rel('_ARG1', [E1, X2])) is not None)
# NOTE(review): fragment — this chunk starts inside a loop over input lines
# (`ln`) whose enclosing definition is not visible here; `lnout`, `wsjd`, `mm`,
# `total_err`, `estub` and `nstub` are bound in the unseen outer scope.
lc = len(lnout)
lnout.append(ln.strip())
if mm not in wsjd:
    # No gold derivation mapped to this id; record and skip.
    lnout.append('ERR: cannot find mapping to %s' % mm)
    total_err += 1
    continue
gold_derivation = wsjd[mm]
e_sentence = None
n_sentence = None
options = CO_NO_VERBNET | CO_NO_WIKI_SEARCH | CO_VARNAMES_MATCH_WORD_INDEX
try:
    # Parse with the EasySRL service, the NCCG service (when available),
    # and the gold derivation, all with the same options.
    if estub is not None:
        ed = grpc.ccg_parse(estub, ln)
        ept = parse_ccg_derivation(ed)
        e_sentence = process_ccg_pt(ept, options)
    if nstub is not None:
        nd = grpc.ccg_parse(nstub, ln)
        npt = parse_ccg_derivation(nd)
        n_sentence = process_ccg_pt(npt, options)
    gpt = parse_ccg_derivation(gold_derivation)
    gold_sentence = process_ccg_pt(gpt, options)
except UnaryRuleError as e:
    lnout.append('ERR: %s' % e)
    total_err += 1
    continue
except Exception as e:
    # Handler continues beyond this chunk — remainder not visible here.
    lnout.append('ERR: %s' % e)
def build_from_ldc_ccgbank(fn_dict, outdir, verbose=False, verify=True):
    """Build functor templates from the LDC ccgbank AUTO derivations.

    Args:
        fn_dict: dict mapping category signature -> FunctorTemplate; updated
            in place and also returned.
        outdir: directory where failure reports are written.
        verbose: if True, also print each failure to stdout.
        verify: if True, assert a newly created template agrees with any
            existing entry for the same signature.

    Returns:
        fn_dict (the same object, updated).
    """
    print('Building function templates from LDC ccgbank...')
    allfiles = []
    ldcpath = os.path.join(projdir, 'data', 'ldc', 'ccgbank_1_1', 'data', 'AUTO')
    dirlist1 = os.listdir(ldcpath)
    for dir1 in dirlist1:
        ldcpath1 = os.path.join(ldcpath, dir1)
        if os.path.isdir(ldcpath1):
            dirlist2 = os.listdir(ldcpath1)
            for dir2 in dirlist2:
                ldcpath2 = os.path.join(ldcpath1, dir2)
                if os.path.isfile(ldcpath2):
                    allfiles.append(ldcpath2)
    failed_parse = []
    failed_rules = []
    rules = []
    progress = 0
    for fn in allfiles:
        progress = print_progress(progress, 10)
        with open(fn, 'r') as fd:
            lines = fd.readlines()
        # AUTO files alternate header line / derivation line.
        for hdr, ccgbank in zip(lines[0::2], lines[1::2]):
            pt = None
            try:
                pt = parse_ccg_derivation(ccgbank)
                extract_predarg_categories_from_pt(pt, rules)
            except Exception as e:
                # Two entries per failure: the derivation, then the error.
                failed_parse.append(safe_utf8_encode('CCGBANK: ' + ccgbank.strip()))
                failed_parse.append(safe_utf8_encode('Error: %s' % e))
            # Now attempt to track undefined unary rules
            if pt is not None:
                try:
                    builder = Ccg2Drs()
                    builder.build_execution_sequence(pt)
                    # Calling this will track undefined unary rules as a
                    # side effect; failures here are deliberately ignored.
                    builder.get_predarg_ccgbank()
                except Exception:
                    pass
    progress = (progress / 10) * 1000
    for predarg in rules:
        progress = print_progress(progress, 1000)
        try:
            catkey = predarg.clean(True)
            template = FunctorTemplate.create_from_category(predarg)
            if template is None:
                continue
            if catkey.signature not in fn_dict:
                fn_dict[catkey.signature] = template
            elif verify:
                f1 = fn_dict[catkey.signature]
                t1 = future_string(f1)
                t2 = future_string(template)
                assert t1 == t2, 'verify failed\n t1=%s\n t2=%s\n f1=%s\n f2=%s' % (t1, t2, f1.predarg_category, predarg)
        except Exception as e:
            failed_rules.append(safe_utf8_encode('%s: %s' % (predarg, e)))
            # DEBUG ? (re-run under a debugger by flipping the guard)
            if False:
                try:
                    FunctorTemplate.create_from_category(predarg)
                except Exception:
                    pass
    print_progress(progress, done=True)
    if len(failed_parse) != 0:
        print('Warning: ldc - %d parses failed' % (len(failed_parse)/2))
        with open(os.path.join(outdir, 'parse_ccg_derivation_failed.dat'), 'w') as fd:
            fd.write(b'\n'.join(failed_parse))
        if verbose:
            # FIX: failed_parse holds flat strings (two per failure); the old
            # `for x, m in failed_parse` tried to unpack each string into two
            # names and raised ValueError. Print every entry instead.
            for m in failed_parse:
                print(m)
    if len(failed_rules) != 0:
        print('Warning: ldc - %d rules failed' % len(failed_rules))
        with open(os.path.join(outdir, 'functor_ldc_templates_failed.dat'), 'w') as fd:
            fd.write(b'\n'.join(failed_rules))
        if verbose:
            for m in failed_rules:
                print(m)
    return fn_dict
def run(self):
    """Process messages.

    Reads articles from the AWS news queue, parses title and body sentences
    through the CCG service (with connection-error retries), and forwards the
    JSON result to the ccg queue, shrinking oversized payloads.
    """
    for message in receive_messages(self.aws.news_queue, MessageAttributeNames=['All']):
        global _logger
        # Attributes will be passed onto next queue
        attributes = message.message_attributes
        mhash = attributes['hash']['StringValue']
        _logger.debug('Received news_queue(%s) -> hash(%s)', message.message_id, mhash)
        body = json.loads(message.body)
        retry = 3
        ccgbank = None
        title = body['title']
        # Non-empty, stripped paragraphs (Py2 filter/map return lists).
        paragraphs_in = filter(lambda y: len(y) != 0, map(lambda x: x.strip(), body['content'].split('\n')))
        paragraphs_out = []
        if len(paragraphs_in) == 0:
            # NOTE(review): args are passed as one tuple, so the second %s
            # placeholder is never filled — probably meant separate args.
            _logger.debug('No paragraphs for story %s\n%s', (mhash, title))
        # Use NLTK to split paragraphs into sentences.
        for p in paragraphs_in:
            sentences = filter(lambda x: len(x.strip()) != 0, sent_tokenize(p))
            paragraphs_out.append(sentences)
            if self.state.terminate:
                break
        result = {}
        result['title'] = {}
        while retry:
            try:
                # Parse the title, then every body sentence.
                ccgbank = grpc.ccg_parse(self.aws.stub, title, grpc.DEFAULT_SESSION)
                pt = parse_ccg_derivation(ccgbank)
                ccg = process_ccg_pt(pt, options=self.options)
                result['title']['lexemes'] = [x.get_json() for x in ccg.get_span()]
                result['title']['constituents'] = [c.get_json() for c in ccg.constituents]
                ccgpara = []
                result['paragraphs'] = ccgpara
                for sentences in paragraphs_out:
                    ccgsent = []
                    ccgpara.append(ccgsent)
                    for s in sentences:
                        smod = preprocess_sentence(s)
                        ccgbank = grpc.ccg_parse(self.aws.stub, smod, grpc.DEFAULT_SESSION)
                        pt = parse_ccg_derivation(ccgbank)
                        ccg = process_ccg_pt(pt, options=self.options)
                        ccgentry = {}
                        ccgentry['lexemes'] = [x.get_json() for x in ccg.get_span()]
                        ccgentry['constituents'] = [c.get_json() for c in ccg.constituents]
                        ccgsent.append(ccgentry)
                break   # exit while
            except requests.exceptions.ConnectionError as e:
                # Transient network failure - back off briefly and retry.
                time.sleep(0.25)
                retry -= 1
                _logger.exception('AwsNewsQueueReader.run', exc_info=e)
                if self.state.pass_on_exceptions:
                    raise
            except Exception as e:
                # After X reads AWS sends the item to the dead letter queue.
                # X is configurable in AWS console.
                retry = 0
                _logger.exception('AwsNewsQueueReader.run', exc_info=e, rlimitby=mhash)
                if self.state.pass_on_exceptions:
                    raise
        if self.state.terminate:
            retry = 0
            break
        # retry == 0 indicates failure
        if retry == 0:
            continue
        try:
            # Let the queue know that the message is processed
            message.delete()
            if self.aws.ccg_queue:
                ireduce = -1
                iorig = len(result['paragraphs'])
                while True:
                    strm = StringIO.StringIO()
                    # Add indent so easier to debug
                    json.dump(result, strm, indent=2)
                    data = strm.getvalue()
                    # SQS-style payload cap: trim trailing paragraphs until
                    # the serialized result fits under 200 KiB.
                    if len(data) >= 200*1024:
                        para = result['paragraphs']
                        ireduce = max([1, (len(para) * 200 * 1024)/ len(data)])
                        ireduce = min([len(para)-1, ireduce])
                        result['paragraphs'] = para[0:ireduce]
                    else:
                        break
                    if len(result['paragraphs']) <= 1:
                        break
                if ireduce >= 0:
                    _logger.warning('Hash(%s) ccg paragraphs reduced from %d to %d' % (mhash, iorig, ireduce))
                response = self.aws.ccg_queue.send_message(MessageAttributes=attributes, MessageBody=data)
                _logger.debug('Sent hash(%s) -> ccg_queue(%s)', mhash, response['MessageId'])
        except Exception as e:
            _logger.exception('AwsNewsQueueReader.run', exc_info=e, rlimitby=mhash)
            if self.state.pass_on_exceptions:
                raise
# NOTE(review): fragment — enclosing scope (apparently a per-derivation output
# routine driven by a parsed `options` namespace) is not visible in this chunk,
# and the final try-block continues beyond it.
ccg = None
drs = None
pccg = None
fol = None
constituents = None
orphaned = None
conjoins = None
functor_phrases = None
vnconstituents = ''
# NOTE(review): `constituents` is reassigned from None to '' here — one of the
# two initializations is probably redundant.
constituents = ''
vsent = None
sentence = None
if options.ofmt == 'drs':
    try:
        # NOTE(review): `ccg` is None at this point in the visible code;
        # presumably it is assigned between these statements in the unseen
        # portion of the enclosing scope — confirm against the full source.
        pt = parse_ccg_derivation(ccg)
        pccg = pt_to_ccg_derivation(pt)
    except Exception as e:
        print('Error: failed to parse ccgbank - %s' % str(e))
        raise
    # Build processing options from the command-line style flags.
    ops = CO_BUILD_STATES if options.wordvars else CO_ADD_STATE_PREDICATES
    ops |= CO_NO_VERBNET if options.no_vn else 0
    ops |= CO_NO_WIKI_SEARCH if options.no_wp else 0
    try:
        sentence = process_ccg_pt(pt, ops)
        d = sentence.get_drs()
        fol, _ = d.to_fol()
        fol = unicode(fol)
        drs = d.show(SHOW_LINEAR)
def test2_GOLD_Wsj0003_1(self):
    # GOLD parse of wsj_0003.1:
    # "A form of asbestos once used to make Kent cigarette filters has caused
    #  a high percentage of cancer deaths among a group of workers exposed to
    #  it more than 30 years ago, researchers reported."
    banktxt = r'''(<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T S[dcl] 1 2> (<T NP 0 2> (<T NP 0 2> (<T NP 1 2> (<L NP[nb]/N DT DT A NP[nb]_166/N_166>) (<L N NN NN form N>) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_174\NP_174)/NP_175>) (<T NP 0 1> (<L N NN NN asbestos N>) ) ) ) (<T NP\NP 0 1> (<T S[pss]\NP 1 2> (<L (S\NP)/(S\NP) RB RB once (S_235\NP_230)_235/(S_235\NP_230)_235>) (<T S[pss]\NP 0 2> (<L (S[pss]\NP)/(S[to]\NP) VBN VBN used (S[pss]\NP_187)/(S[to]_188\NP_187:B)_188>) (<T S[to]\NP 0 2> (<L (S[to]\NP)/(S[b]\NP) TO TO to (S[to]\NP_197)/(S[b]_198\NP_197:B)_198>) (<T S[b]\NP 0 2> (<L (S[b]\NP)/NP VB VB make (S[b]\NP_205)/NP_206>) (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Kent N_222/N_222>) (<T N 1 2> (<L N/N NN NN cigarette N_215/N_215>) (<L N NNS NNS filters N>) ) ) ) ) ) ) ) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[pt]\NP) VBZ VBZ has (S[dcl]\NP_23)/(S[pt]_24\NP_23:B)_24>) (<T S[pt]\NP 0 2> (<L (S[pt]\NP)/NP VBN VBN caused (S[pt]\NP_31)/NP_32>) (<T NP 0 2> (<T NP 0 2> (<T NP 1 2> (<L NP[nb]/N DT DT a NP[nb]_46/N_46>) (<T N 1 2> (<L N/N JJ JJ high N_41/N_41>) (<L N NN NN percentage N>) ) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_54\NP_54)/NP_55>) (<T NP 0 1> (<T N 1 2> (<L N/N NN NN cancer N_64/N_64>) (<L N NNS NNS deaths N>) ) ) ) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN among (NP_73\NP_73)/NP_74>) (<T NP 0 2> (<T NP 1 2> (<L NP[nb]/N DT DT a NP[nb]_81/N_81>) (<L N NN NN group N>) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_89\NP_89)/NP_90>) (<T NP 0 2> (<T NP 0 1> (<L N NNS NNS workers N>) ) (<T NP\NP 0 1> (<T S[pss]\NP 0 2> (<T S[pss]\NP 0 2> (<L (S[pss]\NP)/PP VBN VBN exposed (S[pss]\NP_100)/PP_101>) (<T PP 0 2> (<L PP/NP TO TO to PP/NP_106>) (<L NP PRP PRP it NP>) ) ) (<T (S\NP)\(S\NP) 1 2> (<T NP 0 1> (<T N 1 2> (<T N/N 1 2> (<T (N/N)/(N/N) 1 2> (<L S[adj]\NP RBR RBR more S[adj]\NP_153>) (<L ((N/N)/(N/N))\(S[adj]\NP) IN IN than ((N_147/N_139)_147/(N_147/N_139)_147)\(S[adj]_148\NP_142)_148>) ) (<L N/N CD CD 30 N_131/N_131>) ) (<L N NNS NNS years N>) ) ) (<L ((S\NP)\(S\NP))\NP IN IN ago ((S_121\NP_116)_121\(S_121\NP_116)_121)\NP_122>) ) ) ) ) ) ) ) ) ) ) ) (<T S[dcl]\S[dcl] 1 2> (<L , , , , ,>) (<T S[dcl]\S[dcl] 1 2> (<T NP 0 1> (<L N NNS NNS researchers N>) ) (<L (S[dcl]\S[dcl])\NP VBD VBD reported (S[dcl]\S[dcl]_8)\NP_9>) ) ) ) (<L . . . . .>) )'''
    ptree = parse_ccg_derivation(banktxt)
    stext = sentence_from_pt(ptree)
    dprint(stext)
    self.assertIsNotNone(ptree)
    # Build the DRS without external lookups so the result is deterministic.
    builder = Ccg2Drs(CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    builder.build_execution_sequence(ptree)
    builder.create_drs()
    builder.final_rename()
    drs = builder.get_drs()
    stext = drs.show(SHOW_LINEAR)
    dprint(stext)
    vsent = builder.get_verbnet_sentence()
    actual = get_constituents_string_list(vsent)
    # '#' marks the head word of each constituent.
    expected = [
        'NP(A #form)',                                              # 0
        'PP(#of)',                                                  # 1
        'NP(#asbestos)',                                            # 2
        'ADVP(once #used to make Kent cigarette filters)',          # 3
        'S_INF(#to make)',                                          # 4
        'NP(Kent cigarette #filters)',                              # 5
        'VP(#has caused)',                                          # 6
        'NP(a high #percentage)',                                   # 7
        'PP(#of)',                                                  # 8
        'NP(cancer #deaths)',                                       # 9
        'PP(#among)',                                               # 10
        'NP(a #group)',                                             # 11
        'PP(#of)',                                                  # 12
        'NP(#workers)',                                             # 13
        'ADVP(#exposed to it more than 30 years ago)',              # 14
        'NP(more than 30 #years)',                                  # 15
        'NP(#researchers)',                                         # 16
        'VP(#reported)',                                            # 17
    ]
    dprint('\n'.join(actual))
    self.assertListEqual(expected, actual)
    # Constituent tree rooted at VP(reported), indices per the list above.
    expected = (17, [(6, [(0, [(1, [(2, [])]), (3, [(4, [(5, [])])])]), (7, [(8, [(9, [])]), (10, [(11, [(12, [(13, [(14, [(15, [])])])])])])])]), (16, [])])
    actual = vsent.get_constituent_tree()
    dprint_constituent_tree(vsent, actual)
    self.assertEqual(repr(expected), repr(actual))
def test2_GOLD_Wsj0001_2(self):
    # GOLD parse of wsj_0001.2:
    # "Mr. Vinken is chairman of Elsevier N.V. , the Dutch publishing group ."
    banktxt = r''' (<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Mr. N_142/N_142>) (<L N NNP NNP Vinken N>) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/NP VBZ VBZ is (S[dcl]\NP_87)/NP_88>) (<T NP 0 2> (<T NP 0 1> (<L N NN NN chairman N>) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_99\NP_99)/NP_100>) (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Elsevier N_109/N_109>) (<L N NNP NNP N.V. N>) ) ) (<T NP[conj] 1 2> (<L , , , , ,>) (<T NP 1 2> (<L NP[nb]/N DT DT the NP[nb]_131/N_131>) (<T N 1 2> (<L N/N NNP NNP Dutch N_126/N_126>) (<T N 1 2> (<L N/N VBG VBG publishing N_119/N_119>) (<L N NN NN group N>) ) ) ) ) ) ) ) ) ) (<L . . . . .>) )'''
    ptree = parse_ccg_derivation(banktxt)
    stext = sentence_from_pt(ptree)
    dprint(stext)
    self.assertIsNotNone(ptree)
    # Deterministic build: signatures verified, no VerbNet/Wikipedia lookups.
    builder = Ccg2Drs(CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    builder.build_execution_sequence(ptree)
    builder.create_drs()
    builder.resolve_proper_names()
    builder.final_rename()
    drs = builder.get_drs()
    stext = drs.show(SHOW_LINEAR)
    dprint(stext)
    vsent = builder.get_verbnet_sentence()
    actual = get_constituents_string_list(vsent)
    # '#' marks the head word of each constituent.
    expected = [
        'NP(#Mr.-Vinken)',
        'VP(#is)',
        'NP(#chairman)',
        'PP(#of)',
        'NP(#Elsevier-N.V.)',
        'NP(the Dutch publishing #group)',
    ]
    dprint('\n'.join(actual))
    self.assertListEqual(expected, actual)
    # Constituent tree rooted at VP(is), indices per the list above.
    expected = (1, [(0, []), (2, [(3, [(4, [(5, [])])])])])
    actual = vsent.get_constituent_tree()
    dprint_constituent_tree(vsent, actual)
    self.assertEqual(repr(expected), repr(actual))
def test2_GOLD_Wsj0001_1(self):
    # GOLD parse of wsj_0001.1:
    # "Pierre Vinken, 61 years old, will join the board as a nonexecutive
    #  director Nov 29."
    banktxt = r'''(<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 2> (<T NP 0 2> (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Pierre N_73/N_73>) (<L N NNP NNP Vinken N>) ) ) (<L , , , , ,>) ) (<T NP\NP 0 1> (<T S[adj]\NP 1 2> (<T NP 0 1> (<T N 1 2> (<L N/N CD CD 61 N_93/N_93>) (<L N NNS NNS years N>) ) ) (<L (S[adj]\NP)\NP JJ JJ old (S[adj]\NP_83)\NP_84>) ) ) ) (<L , , , , ,>) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[b]\NP) MD MD will (S[dcl]\NP_10)/(S[b]_11\NP_10:B)_11>) (<T S[b]\NP 0 2> (<T S[b]\NP 0 2> (<T (S[b]\NP)/PP 0 2> (<L ((S[b]\NP)/PP)/NP VB VB join ((S[b]\NP_20)/PP_21)/NP_22>) (<T NP 1 2> (<L NP[nb]/N DT DT the NP[nb]_29/N_29>) (<L N NN NN board N>) ) ) (<T PP 0 2> (<L PP/NP IN IN as PP/NP_34>) (<T NP 1 2> (<L NP[nb]/N DT DT a NP[nb]_48/N_48>) (<T N 1 2> (<L N/N JJ JJ nonexecutive N_43/N_43>) (<L N NN NN director N>) ) ) ) ) (<T (S\NP)\(S\NP) 0 2> (<L ((S\NP)\(S\NP))/N[num] NNP NNP Nov. ((S_61\NP_56)_61\(S_61\NP_56)_61)/N[num]_62>) (<L N[num] CD CD 29 N[num]>) ) ) ) ) (<L . . . . .>) )'''
    ptree = parse_ccg_derivation(banktxt)
    self.assertIsNotNone(ptree)
    stext = sentence_from_pt(ptree)
    dprint(stext)
    # Deterministic build: signatures verified, no VerbNet/Wikipedia lookups.
    builder = Ccg2Drs(CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    builder.build_execution_sequence(ptree)
    builder.create_drs()
    builder.resolve_proper_names()
    builder.final_rename()
    drs = builder.get_drs()
    stext = drs.show(SHOW_LINEAR)
    dprint(stext)
    vsent = builder.get_verbnet_sentence()
    actual = get_constituents_string_list(vsent)
    # FIXME: VP(will #join) should be S_INF(will #join).
    # Issues occurs because I convert modal-verb combinator categories to
    # modifiers. Must be fixed on functor creation - Lexeme.get_production()
    # will: (S[dcl]\NP)/(S[b]/NP) -> (S\NP)/(S/NP)
    expected = [
        'NP(#Pierre-Vinken)',
        'ADJP(61 years #old)',
        'NP(61 #years)',
        'VP(#will join)',
        'NP(the #board)',
        'PP(#as)',
        'NP(a nonexecutive #director)',
        'NP(#Nov. 29)'
    ]
    dprint('\n'.join(actual))
    self.assertListEqual(expected, actual)
    # Constituent tree rooted at VP(will join), indices per the list above.
    expected = (3, [(0, [(1, [(2, [])])]), (4, []), (5, [(6, [])]), (7, [])])
    actual = vsent.get_constituent_tree()
    dprint_constituent_tree(vsent, actual)
    self.assertEqual(repr(expected), repr(actual))
def test2_GOLD_Wsj0002_1(self):
    # GOLD parse of wsj_0002.1:
    # "Rudolph Agnew, 55 years old and former chairman of Consolidated Gold
    #  Fields PLC, was named a nonexecutive director of this British
    #  industrial conglomerate."
    banktxt = r'''(<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 2> (<T NP 0 2> (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Rudolph N_72/N_72>) (<L N NNP NNP Agnew N>) ) ) (<L , , , , ,>) ) (<T NP\NP 0 1> (<T S[adj]\NP 0 2> (<T S[adj]\NP 1 2> (<T NP 0 1> (<T N 1 2> (<L N/N CD CD 55 N_92/N_92>) (<L N NNS NNS years N>) ) ) (<L (S[adj]\NP)\NP JJ JJ old (S[adj]\NP_82)\NP_83>) ) (<T S[adj]\NP[conj] 1 2> (<L conj CC CC and conj>) (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N JJ JJ former N_102/N_102>) (<L N NN NN chairman N>) ) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_111\NP_111)/NP_112>) (<T NP 0 1> (<T N 1 2> (<L N/N NNP NNP Consolidated N_135/N_135>) (<T N 1 2> (<L N/N NNP NNP Gold N_128/N_128>) (<T N 1 2> (<L N/N NNP NNP Fields N_121/N_121>) (<L N NNP NNP PLC N>) ) ) ) ) ) ) ) ) ) ) (<L , , , , ,>) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[pss]\NP) VBD VBD was (S[dcl]\NP_10)/(S[pss]_11\NP_10:B)_11>) (<T S[pss]\NP 0 2> (<L (S[pss]\NP)/NP VBN VBN named (S[pss]\NP_18)/NP_19>) (<T NP 0 2> (<T NP 1 2> (<L NP[nb]/N DT DT a NP[nb]_33/N_33>) (<T N 1 2> (<L N/N JJ JJ nonexecutive N_28/N_28>) (<L N NN NN director N>) ) ) (<T NP\NP 0 2> (<L (NP\NP)/NP IN IN of (NP_41\NP_41)/NP_42>) (<T NP 1 2> (<L NP[nb]/N DT DT this NP[nb]_63/N_63>) (<T N 1 2> (<L N/N JJ JJ British N_58/N_58>) (<T N 1 2> (<L N/N JJ JJ industrial N_51/N_51>) (<L N NN NN conglomerate N>) ) ) ) ) ) ) ) ) (<L . . . . .>) )'''
    ptree = parse_ccg_derivation(banktxt)
    self.assertIsNotNone(ptree)
    stext = sentence_from_pt(ptree)
    dprint(stext)
    # Deterministic build: signatures verified, no VerbNet/Wikipedia lookups.
    builder = Ccg2Drs(CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    builder.build_execution_sequence(ptree)
    builder.create_drs()
    builder.resolve_proper_names()
    builder.final_rename()
    drs = builder.get_drs()
    stext = drs.show(SHOW_LINEAR)
    dprint(stext)
    vsent = builder.get_verbnet_sentence()
    actual = get_constituents_string_list(vsent)
    dprint('\n'.join(actual))
    # Hash indicates head word in constituent
    expected = [
        'NP(#Rudolph-Agnew)',
        'ADJP(55 years #old and former chairman of Consolidated-Gold-Fields-PLC)',
        'NP(55 #years)',
        'NP(former #chairman)',
        'PP(#of)',
        'NP(#Consolidated-Gold-Fields-PLC)',
        'VP(#was named)',
        'NP(a nonexecutive #director)',
        'PP(#of)',
        'NP(this British industrial #conglomerate)'
    ]
    self.assertListEqual(expected, actual)
    # Constituent tree rooted at VP(was named), indices per the list above.
    expected = (6, [(0, [(1, [(2, []), (3, [(4, [(5, [])])])])]), (7, [(8, [(9, [])])])])
    actual = vsent.get_constituent_tree()
    dprint_constituent_tree(vsent, actual)
    self.assertEqual(repr(expected), repr(actual))
def make_drs(daemon):
    """Batch-convert CCG derivations for `daemon` into DRS data files.

    Reads every 'ccg_derivation' file under data/ldc/<daemon>/ccgbank,
    converts each non-comment line to a DRS, and writes a
    drs_<id>_<line>.dat file (sentence, linear DRS, predarg sections) under
    data/ldc/<daemon>/drs/<id>/.  Failures are tallied and reported at the
    end instead of aborting the whole batch.

    :param daemon: Name of the parser daemon whose ccgbank output to convert.
    """
    global pypath, projdir, datapath, idsrch
    allfiles = []
    projdir = os.path.dirname(os.path.dirname(__file__))
    easysrl_path = os.path.join(projdir, 'data', 'ldc', daemon, 'drs')
    if not os.path.exists(easysrl_path):
        os.makedirs(easysrl_path)
    # Get files
    ldcpath = os.path.join(projdir, 'data', 'ldc', daemon, 'ccgbank')
    dirlist1 = os.listdir(ldcpath)
    for fname in dirlist1:
        if 'ccg_derivation' not in fname:
            continue
        ldcpath1 = os.path.join(ldcpath, fname)
        if os.path.isfile(ldcpath1):
            allfiles.append(ldcpath1)

    failed_parse = 0
    failed_ccg2drs = []
    start = 0
    progress = -1
    for fn in allfiles:
        idx = idsrch.match(fn)
        if idx is None:
            continue
        idx = idx.group('id')
        if not os.path.exists(os.path.join(easysrl_path, idx)):
            os.mkdir(os.path.join(easysrl_path, idx))
        with open(fn, 'r') as fd:
            lines = fd.readlines()
        name, _ = os.path.splitext(os.path.basename(fn))
        for i in range(start, len(lines)):
            start = 0
            ccgbank = lines[i].strip()
            # Skip blank lines and '#' comment lines.
            if len(ccgbank) == 0 or ccgbank[0] == '#':
                continue
            if progress < 0:
                print('%s-%04d' % (name, i))
            else:
                progress = print_progress(progress, 10)
            try:
                # CCG parser is Java so output is UTF-8.
                pt = parse_ccg_derivation(ccgbank)
                s = sentence_from_pt(pt).strip()
                pccg = pt_to_ccg_derivation(pt)
            except Exception:
                # FIX: the original did `raise` here followed by an
                # unreachable `continue`, which aborted the batch and made the
                # failure summary below dead code.  Count and move on instead.
                failed_parse += 1
                continue
            try:
                d = process_ccg_pt(
                    pt, CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH).get_drs()
                assert d is not None
                assert isinstance(d, DRS)
                d = d.show(SHOW_LINEAR).strip()
            except Exception as e:
                # Same fix as above: record the failure and keep processing.
                print(e)
                failed_ccg2drs.append((name, i, ccgbank))
                continue
            # Write the three sections as UTF-8 bytes.
            with open(
                    os.path.join(easysrl_path, idx, 'drs_%s_%04d.dat' % (idx, i)), 'w') as fd:
                fd.write(b'<sentence>\n')
                fd.write(safe_utf8_encode(s))
                fd.write(b'\n</sentence>\n<drs>\n')
                fd.write(safe_utf8_encode(d))
                fd.write(b'\n</drs>\n<predarg>\n')
                fd.write(safe_utf8_encode(pccg))
                fd.write(b'\n')
                fd.write(b'</predarg>\n')

    if failed_parse != 0:
        print('%d derivations failed to parse' % failed_parse)
    if len(failed_ccg2drs) != 0:
        print('%d derivations failed to convert to DRS' % len(failed_ccg2drs))
        for x in failed_ccg2drs:
            print('%s-%04d failed: {%s}' % x)
def test2_GOLD_Wsj0051_13(self):
    # GOLD parse of wsj_0051.13:
    # "The bids, he added, were contrary to common sense."
    banktxt = r''' (<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 1 2> (<L NP[nb]/N DT DT The NP[nb]_273/N_273>) (<L N NNS NNS bids N>) ) (<T S[dcl]\NP 1 2> (<T (S\NP)/(S\NP) 1 2> (<L , , , , ,>) (<T (S\NP)/(S\NP) 0 2> (<T S[dcl]/S[dcl] 1 2> (<T S/(S\NP) 0 1> (<L NP PRP PRP he NP>) ) (<L (S[dcl]\NP)/S[dcl] VBD VBD added (S[dcl]\NP_242)/S[dcl]_243>) ) (<L , , , , ,>) ) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/(S[adj]\NP) VBD VBD were (S[dcl]\NP_211)/(S[adj]_212\NP_211:B)_212>) (<T S[adj]\NP 0 2> (<L (S[adj]\NP)/PP JJ JJ contrary (S[adj]\NP_219)/PP_220>) (<T PP 0 2> (<L PP/NP TO TO to PP/NP_225>) (<T NP 0 1> (<T N 1 2> (<L N/N JJ JJ common N_234/N_234>) (<L N NN NN sense N>) ) ) ) ) ) ) ) (<L . . . . .>) ) '''
    ptree = parse_ccg_derivation(banktxt)
    stext = sentence_from_pt(ptree)
    dprint(stext)
    self.assertIsNotNone(ptree)
    # Deterministic build: signatures verified, no VerbNet/Wikipedia lookups.
    builder = Ccg2Drs(CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
    builder.build_execution_sequence(ptree)
    builder.create_drs()
    builder.final_rename()
    drs = builder.get_drs()
    stext = drs.show(SHOW_LINEAR)
    dprint(stext)
    vsent = builder.get_verbnet_sentence()
    actual = get_constituents_string_list(vsent)
    # '#' marks the head word of each constituent.
    expected = [
        'NP(The #bids)',
        'ADVP(he #added)',
        'VP(#were)',
        'ADJP(#contrary to common sense)',
        'PP(#to)',
        'NP(common #sense)'
    ]
    dprint('\n'.join(actual))
    self.assertListEqual(expected, actual)