コード例 #1
0
    def test1_JsonFiles(self):
        """Parse the title and body sentences of matching JSON stories to CCG.

        Scans datapath for the one story file with the hard-coded id,
        parses its title and every body sentence through the gRPC CCG
        parser, and assembles the results into a JSON message body.
        """
        filelist = os.listdir(datapath)
        allfiles = []
        for fn in filelist:
            if not os.path.isfile(os.path.join(datapath, fn)):
                continue
            f, x = os.path.splitext(fn)
            # Only the single story with this id is processed.
            if x == '.json' and f == '9255a890ffe40c05876d8d402044ab11':
                allfiles.append(os.path.join(datapath, fn))

        for fn in allfiles:
            with open(fn, 'r') as fd:
                body = json.load(fd, encoding='utf-8')

            smod = preprocess_sentence(body['title'])
            ccgbank = grpc.ccg_parse(self.stub, smod, grpc.DEFAULT_SESSION)
            pt = parse_ccg_derivation(ccgbank)
            ccg = process_ccg_pt(pt)

            ccgbody = {
                'story': {
                    'title': [x.get_json() for x in ccg.get_span()],
                    'paragraphs': []
                }
            }
            # Non-empty, stripped paragraphs of the story body.
            paragraphs = filter(
                lambda y: len(y) != 0,
                map(lambda x: x.strip(), body['content'].split('\n')))
            # enumerate replaces the old manual i/j counters; the previous
            # paragraphs[i:]/sentences[j:] slices always started at 0 and
            # were no-ops.
            for i, p in enumerate(paragraphs):
                sentences = filter(lambda x: len(x.strip()) != 0,
                                   sent_tokenize(p))
                sp = []
                for j, s in enumerate(sentences):
                    dprint('p:s = %d:%d' % (i, j))
                    smod = preprocess_sentence(s)
                    ccgbank = grpc.ccg_parse(self.stub, smod,
                                             grpc.DEFAULT_SESSION)
                    pt = parse_ccg_derivation(ccgbank)
                    ccg = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
                    sp.append([x.get_json() for x in ccg.get_span()])
                ccgbody['story']['paragraphs'].append(sp)

            # NOTE(review): msgbody is built but not asserted or sent here.
            msgbody = json.dumps(ccgbody)
コード例 #2
0
 def test1_PP_Attachment(self):
     """Check constituents for a sentence where NCCG mis-attaches the PP."""
     # NCCG get the PP attachment wrong
     txt = "Eat spaghetti with meatballs"
     pt = parse_ccg_derivation(
         grpc.ccg_parse(self.stub, txt, grpc.DEFAULT_SESSION))
     self.assertIsNotNone(pt)
     dprint(sentence_from_pt(pt))
     sent = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     dprint(sent.get_drs().show(SHOW_LINEAR))
     actual = get_constituents_string_list(sent)
     dprint('\n'.join(actual))
     expected = [
         'S_INF(#Eat spaghetti with meatballs)',  # 0
         'NP(#spaghetti)',  # 1
         'NP(#meatballs)',  # 2
     ]
     self.assertListEqual(expected, actual)
     expected_tree = (0, [(1, []), (2, [])])
     tree = sent.get_constituent_tree()
     dprint_constituent_tree(sent, tree)
     self.assertEqual(repr(expected_tree), repr(tree))
     vsent = get_constituent_string(sent.get_verbnet_sentence())
     self.assertEqual('S_INF(#Eat with) NP(#spaghetti) NP(#meatballs)',
                      vsent)
コード例 #3
0
ファイル: conj_test.py プロジェクト: marbles-ai/ie
 def test02_AndOfObj(self):
     """Conjoined proper-noun objects: 'He saw John and Paul'."""
     mtext = preprocess_sentence("He saw John and Paul")
     derivation = grpc.ccg_parse(self.stub, mtext, grpc.DEFAULT_SESSION)
     pt = parse_ccg_derivation(derivation)
     sentence = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     d = sentence.get_drs()
     dprint(pt_to_ccg_derivation(pt))
     dprint(d)
     f = sentence.select_phrases(RT_PROPERNAME | RT_EVENT)
     phrases = [span.text for _, span in f.iteritems()]
     for expected in ('John', 'Paul', 'saw'):
         self.assertTrue(expected in phrases)
     def ref_of(txt):
         # Referent of the first phrase whose text matches exactly.
         return [p for p in f.iteritems() if p[1].text == txt][0][0]
     J = ref_of('John')
     P = ref_of('Paul')
     E = ref_of('saw')
     # FIXME: wn lemmatizer does not convert saw to see - I guess to to ambiguity
     self.assertIsNotNone(d.find_condition(Rel('_EVENT', [E])))
     self.assertIsNotNone(d.find_condition(Rel('saw', [E])))
     self.assertIsNotNone(d.find_condition(Rel('John', [J])))
     self.assertIsNotNone(d.find_condition(Rel('Paul', [P])))
     self.assertIsNotNone(d.find_condition(Rel('_ARG1', [E, J])))
コード例 #4
0
ファイル: conj_test.py プロジェクト: marbles-ai/ie
 def test10_OrOfVerb_OrInBrackets(self):
     """Disjoined verbs plus a parenthesised disjunction.

     Selects phrases that are either WDT-tagged or carry a non-empty DRS,
     then checks the agent/theme argument structure in the resulting DRS.
     """
     text = "That which is perceived or known or inferred to have its own distinct existence (living or nonliving)"
     mtext = preprocess_sentence(text)
     derivation = grpc.ccg_parse(self.stub, mtext, grpc.DEFAULT_SESSION)
     pt = parse_ccg_derivation(derivation)
     sentence = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     d = sentence.get_drs(nodups=True)
     dprint(pt_to_ccg_derivation(pt))
     dprint(d)
     # RT_EMPTY_DRS adds 'or' to phrases
     f = sentence.select_phrases(lambda x: x.pos is POS.from_cache('WDT') or \
                                                0 == (x.mask & RT_EMPTY_DRS),
                                 contiguous=False)
     phrases = [sp.text for r, sp in f.iteritems()]
     self.assertTrue('That which' in phrases)
     self.assertTrue('have' in phrases)
     self.assertTrue('is perceived known inferred' in phrases)
     self.assertTrue('its own distinct existence' in phrases)
     # Each filter(...)[0] picks the first (referent, span) pair whose
     # phrase text matches exactly (Python 2: filter returns a list).
     verb1 = filter(lambda x: 'is perceived known inferred' == x[1].text,
                    f.iteritems())[0]
     verb2 = filter(lambda x: 'have' == x[1].text, f.iteritems())[0]
     agent = filter(lambda x: 'That which' == x[1].text, f.iteritems())[0]
     theme = filter(lambda x: 'its own distinct existence' == x[1].text,
                    f.iteritems())[0]
     X1 = agent[0]
     E1 = verb1[0]
     E2 = verb2[0]
     # Dig the possessor/possessed referents out of the theme span's
     # lexemes; assumes refs layout [_, possessor] / [possessed] — confirm
     # against the Span/Lexeme implementation if this breaks.
     X2 = theme[1][0].refs[1]
     X3 = theme[1][1].refs[0]
     self.assertTrue(d.find_condition(Rel('_EVENT', [E1])) is not None)
     self.assertTrue(d.find_condition(Rel('_ARG0', [E1, X1])) is not None)
     self.assertTrue(d.find_condition(Rel('_ARG1', [E1, E2])) is not None)
     # TODO: should the theme attach to X2?
     self.assertTrue(d.find_condition(Rel('_ARG1', [E2, X3])) is not None)
     self.assertTrue(d.find_condition(Rel('_POSS', [X2, X3])) is not None)
コード例 #5
0
ファイル: conj_test.py プロジェクト: marbles-ai/ie
 def test01_AndOfSubj(self):
     """Conjoined proper-noun subjects: 'John and Paul went to the movies'."""
     mtext = preprocess_sentence("John and Paul went to the movies")
     derivation = grpc.ccg_parse(self.stub, mtext, grpc.DEFAULT_SESSION)
     pt = parse_ccg_derivation(derivation)
     sentence = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     drs = sentence.get_drs()
     dprint(pt_to_ccg_derivation(pt))
     dprint(drs)
     f = sentence.select_phrases(RT_PROPERNAME | RT_EVENT)
     phrases = [span.text for _, span in f.iteritems()]
     for expected in ('John', 'Paul', 'went'):
         self.assertTrue(expected in phrases)
     def ref_of(txt):
         # Referent of the first phrase whose text matches exactly.
         return [p for p in f.iteritems() if p[1].text == txt][0][0]
     J = ref_of('John')
     P = ref_of('Paul')
     E = ref_of('went')
     self.assertIsNotNone(drs.find_condition(Rel('_EVENT', [E])))
     self.assertIsNotNone(drs.find_condition(Rel('go', [E])))
     self.assertIsNotNone(drs.find_condition(Rel('John', [J])))
     self.assertIsNotNone(drs.find_condition(Rel('Paul', [P])))
     self.assertIsNotNone(drs.find_condition(Rel('_ARG0', [E, J])))
コード例 #6
0
ファイル: conj_test.py プロジェクト: marbles-ai/ie
 def test03_OrOfObj(self):
     """Disjoined objects: 'To participate in games or sport'."""
     mtext = preprocess_sentence("To participate in games or sport")
     derivation = grpc.ccg_parse(self.stub, mtext, grpc.DEFAULT_SESSION)
     pt = parse_ccg_derivation(derivation)
     sentence = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     d = sentence.get_drs()
     dprint(pt_to_ccg_derivation(pt))
     dprint(d)
     f = sentence.select_phrases(RT_ENTITY | RT_EVENT)
     phrases = [span.text for _, span in f.iteritems()]
     for expected in ('participate', 'games', 'sport'):
         self.assertTrue(expected in phrases)
     def ref_of(txt):
         # Referent of the first phrase whose text matches exactly.
         return [p for p in f.iteritems() if p[1].text == txt][0][0]
     X1 = ref_of('games')
     X2 = ref_of('sport')
     E = ref_of('participate')
     self.assertIsNotNone(d.find_condition(Rel('_EVENT', [E])))
     self.assertIsNotNone(d.find_condition(Rel('participate', [E])))
     self.assertIsNotNone(d.find_condition(Rel('games', [X1])))
     self.assertIsNotNone(d.find_condition(Rel('sport', [X2])))
     self.assertIsNotNone(d.find_condition(Rel('_ARG1', [E, X2])))
コード例 #7
0
ファイル: conj_test.py プロジェクト: marbles-ai/ie
 def test05_AndOfVerb_AndOfObj(self):
     """Conjoined verbs with a three-way conjoined object list."""
     text = "Bell makes and distributes computers, electronics, and building products"
     mtext = preprocess_sentence(text)
     pt = parse_ccg_derivation(
         grpc.ccg_parse(self.stub, mtext, grpc.DEFAULT_SESSION))
     sentence = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     d = sentence.get_drs()
     dprint(pt_to_ccg_derivation(pt))
     dprint(d)
     f = sentence.select_phrases(RT_PROPERNAME | RT_ENTITY | RT_EVENT
                                 | RT_ATTRIBUTE)
     phrases = [span.text for _, span in f.iteritems()]
     self.assertTrue('Bell' in phrases)
     self.assertTrue('makes distributes' in phrases)
     self.assertTrue('computers' in phrases)
     self.assertTrue('electronics' in phrases)
     # Note if we add RT_EMPTY_DRS to the selection criteria then this phrase becomes 'and building products'
     self.assertTrue('building products' in phrases)
     self.assertEqual(5, len(phrases))
     def ref_of(txt):
         # Referent of the first phrase whose text matches exactly.
         return [p for p in f.iteritems() if p[1].text == txt][0][0]
     E1 = ref_of('makes distributes')
     X1 = ref_of('Bell')
     Y1 = ref_of('computers')    # currently unasserted below
     Y2 = ref_of('electronics')  # currently unasserted below
     Y3 = ref_of('building products')
     self.assertIsNotNone(d.find_condition(Rel('_EVENT', [E1])))
     self.assertIsNotNone(d.find_condition(Rel('_ARG0', [E1, X1])))
     # TODO: should we add proposition for multi NP's conjoined?
     self.assertIsNotNone(d.find_condition(Rel('_ARG1', [E1, Y3])))
コード例 #8
0
ファイル: process_text.py プロジェクト: marbles-ai/ie
def process_file(stub, out, args, titleSrch, wordsep, sessionId):
    """Split each file in args into sentences, parse them via gRPC, write results.

    Lines matching titleSrch are written as titles; other lines are
    accumulated into `line` and split on '.' into sentences. Any trailing
    partial sentence is carried over and prefixed to the next input line.

    Args:
        stub: gRPC parser stub passed to grpc.ccg_parse.
        out: output stream given to write_hdr/write_title/write_line/write_footer.
        args: iterable of input file paths.
        titleSrch: compiled regex; a match marks the line as a title.
        wordsep: single separator character stripped from a carried-over tail.
        sessionId: gRPC session identifier.
    """
    sent_id = 0  # renamed from `id`, which shadowed the builtin
    for fn in args:
        line = ''
        write_hdr(out)
        line_number = 0
        try:
            with open(fn, 'r') as fd:
                while True:
                    ln = fd.readline()
                    line_number += 1
                    if len(ln) == 0:
                        # end of file
                        write_footer(out)
                        break
                    ln = ln.strip()
                    if len(ln) == 0:
                        # Blank line resets any carried-over partial sentence.
                        line = ''
                        continue

                    m = titleSrch.match(ln)
                    if m is not None:
                        line = ''
                        ccg = grpc.ccg_parse(stub, ln, sessionId)
                        write_title(out, sent_id, ln, ccg)
                        continue

                    ln = line + ln
                    sentences = ln.split('.')
                    for s in sentences[:-1]:
                        x = s.strip()
                        if len(x) == 0:
                            continue
                        sent_id += 1
                        ccg = grpc.ccg_parse(stub, x, sessionId)
                        write_line(out, sent_id, x, ccg)

                    # str.split always returns at least one element, so the
                    # previous `if len(sentences) != 0` guard was redundant.
                    s = sentences[-1].strip()
                    if len(s) != 0:
                        if sentences[-1][-1] == wordsep:
                            line = sentences[-1][0:-1]
                        else:
                            line = sentences[-1] + ' '
        except Exception:
            # Narrowed from a bare `except:`; report context, then re-raise
            # so the failure is never hidden.
            print('Exception while processing file "%s" at line %d' %
                  (fn, line_number))
            raise
0
ファイル: number_test.py プロジェクト: marbles-ai/ie
 def test1_Currency_00_0194(self):
     """WSJ 00-0194: currency amounts are kept as single NP nominals.

     Checks the tokenised preprocessing output, NP/VP nominal extraction,
     and the prepositional/argument relations in the resulting DRS.
     """
     text = r"Without the Cray-3 research and development expenses, the company would have been able to report a profit of $19.3 million for the first half of 1989 rather than the $5.9 million it posted."
     etext = r"Without the Cray-3 research and development expenses , the company would have been able to report a profit of $ 19.3 million for the first half of 1989 rather than the $ 5.9 million it posted"
     mtext = preprocess_sentence(text)
     self.assertEqual(etext, mtext)
     derivation = grpc.ccg_parse(self.stub, mtext, grpc.DEFAULT_SESSION)
     pt = parse_ccg_derivation(derivation)
     sentence = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     d = sentence.get_drs(nodups=True)
     dprint(pt_to_ccg_derivation(pt))
     dprint(d)
     fnps = sentence.get_np_nominals()
     nps = [sp.text for r, sp in fnps]
     self.assertTrue('the Cray-3 research and development expenses' in nps)
     self.assertTrue('the company' in nps)
     self.assertTrue('a profit' in nps)
     self.assertTrue('$ 19.3 million' in nps)
     self.assertTrue('the first half' in nps)
     self.assertTrue('the $ 5.9 million' in nps)
     self.assertTrue('1989' in nps)
     fvps = sentence.get_vp_nominals()
     vps = [sp.text for r, sp in fvps]
     self.assertTrue('would have been' in vps)
     self.assertTrue('report' in vps)
     self.assertTrue('posted' in vps)
     # Each filter(...)[0][0] extracts the referent of the first
     # (referent, span) pair whose phrase text matches exactly.
     would_have_been = filter(lambda x: 'would have been' == x[1].text,
                              fvps)[0][0]
     report = filter(lambda x: 'report' == x[1].text, fvps)[0][0]
     posted = filter(lambda x: 'posted' == x[1].text, fvps)[0][0]
     cray_rnd = filter(
         lambda x: 'the Cray-3 research and development expenses' == x[1].
         text, fnps)[0][0]
     company = filter(lambda x: 'the company' == x[1].text, fnps)[0][0]
     profit = filter(lambda x: 'a profit' == x[1].text, fnps)[0][0]
     first_half = filter(lambda x: 'the first half' == x[1].text,
                         fnps)[0][0]
     n1989 = filter(lambda x: '1989' == x[1].text, fnps)[0][0]
     n19_3M = filter(lambda x: '$ 19.3 million' == x[1].text, fnps)[0][0]
     n5_9M = filter(lambda x: 'the $ 5.9 million' == x[1].text, fnps)[0][0]
     self.assertTrue(
         d.find_condition(Rel('without', [would_have_been, cray_rnd]))
         is not None)
     self.assertTrue(
         d.find_condition(Rel('_ARG0', [would_have_been, company]))
         is not None)
     self.assertTrue(
         d.find_condition(Rel('_ARG0', [report, company])) is not None)
     self.assertTrue(
         d.find_condition(Rel('_ARG1', [report, profit])) is not None)
     self.assertTrue(
         d.find_condition(Rel('of', [profit, n19_3M])) is not None)
     self.assertTrue(
         d.find_condition(Rel('for', [profit, first_half])) is not None)
     self.assertTrue(
         d.find_condition(Rel('of', [first_half, n1989])) is not None)
     self.assertTrue(
         d.find_condition(Rel('_ARG1', [posted, n5_9M])) is not None)
コード例 #10
0
def make_derivations(daemon):
    """Generate CCG derivations for every LDC RAW file with the given daemon.

    For each RAW file whose name matches idsrch, parses every line via the
    daemon's gRPC service and writes ccg_derivation<id>.txt (one derivation
    per line; failures become '# FAILED: ...' comments so line numbers stay
    aligned) and ccg_failed<id>.txt (the failed sentences) under the
    daemon's ccgbank output directory.
    """
    global pypath, projdir, datapath, idsrch
    esrlpath = os.path.join(projdir, 'data', 'ldc', daemon, 'ccgbank')
    if not os.path.exists(esrlpath):
        os.makedirs(esrlpath)

    progress = 0
    svc = grpc.CcgParserService(daemon)
    stub = svc.open_client()

    failed_total = 0
    ldcpath = os.path.join(projdir, 'data', 'ldc', 'ccgbank_1_1', 'data',
                           'RAW')
    dirlist = os.listdir(ldcpath)

    try:
        for fname in dirlist:
            ldcpath1 = os.path.join(ldcpath, fname)
            with open(ldcpath1, 'r') as fd:
                lines = fd.readlines()

            # Skip files whose names do not carry a section id.
            m = idsrch.match(os.path.basename(ldcpath1))
            if m is None:
                continue

            derivations = []
            failed_parse = []
            for ln in lines:
                # Parse with EasySRL via gRPC
                try:
                    ccg = grpc.ccg_parse(stub, ln)
                    derivations.append(safe_utf8_encode(ccg.replace('\n', '')))
                except Exception:
                    failed_parse.append(safe_utf8_encode(ln.strip()))
                    # Add comment so line numbers match id's
                    derivations.append(
                        safe_utf8_encode('# FAILED: ' + ln.strip()))
                progress = print_progress(progress, 10)

            # Renamed from `id`, which shadowed the builtin.
            section_id = m.group('id')
            if len(derivations) != 0:
                with open(os.path.join(esrlpath,
                                       'ccg_derivation%s.txt' % section_id),
                          'w') as fd:
                    fd.write(b'\n'.join(derivations))

            failed_total += len(failed_parse)
            if len(failed_parse) != 0:
                with open(os.path.join(esrlpath,
                                       'ccg_failed%s.txt' % section_id),
                          'w') as fd:
                    fd.write(b'\n'.join(failed_parse))
    finally:
        # Always finish the progress display and release the parser service.
        print_progress(progress, 10, done=True)
        svc.shutdown()

    if failed_total != 0:
        print('THERE WERE %d PARSE FAILURES' % failed_total)
コード例 #11
0
 def test2_Wsj_0056_1(self):
     """A lone '@' token parses to a single-constituent sentence."""
     # RAW 1043
     txt = '''@'''
     pt = parse_ccg_derivation(
         grpc.ccg_parse(self.stub, txt, grpc.DEFAULT_SESSION))
     self.assertIsNotNone(pt)
     dprint(sentence_from_pt(pt))
     sent = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     dprint(sent.get_drs().show(SHOW_LINEAR))
     constituents = get_constituents_string_list(sent)
     dprint('\n'.join(constituents))
     self.assertListEqual(['S(#@)'], constituents)
コード例 #12
0
ファイル: number_test.py プロジェクト: marbles-ai/ie
 def test2_Date_21_0985(self):
     """Dates and years survive preprocessing and appear as NP nominals."""
     text = r"Annualized interest rates on certain investments as reported by the Federal Reserve Board on a weekly-average basis: 1989 and Wednesday October 4, 1989."
     etext = r"Annualized interest rates on certain investments as reported by the Federal Reserve Board on a weekly-average basis : 1989 and Wednesday October 4 , 1989"
     mtext = preprocess_sentence(text)
     self.assertEqual(etext, mtext)
     pt = parse_ccg_derivation(
         grpc.ccg_parse(self.stub, mtext, grpc.DEFAULT_SESSION))
     sentence = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     dprint(pt_to_ccg_derivation(pt))
     dprint(sentence.get_drs())
     nps = [span.text for _, span in sentence.get_np_nominals()]
     for phrase in ('Annualized interest rates',
                    'certain investments',
                    'the Federal-Reserve-Board',
                    'a weekly-average basis',
                    'Wednesday October 4'):
         self.assertTrue(phrase in nps)
コード例 #13
0
ファイル: number_test.py プロジェクト: marbles-ai/ie
 def test2_Date_00_1228(self):
     """Month-day dates (Jan. 2, Dec. 15) are kept as NP nominals."""
     text = r"The reduced dividend is payable Jan. 2 to stock of record Dec. 15"
     etext = r"The reduced dividend is payable Jan. 2 to stock of record Dec. 15"
     mtext = preprocess_sentence(text)
     self.assertEqual(etext, mtext)
     pt = parse_ccg_derivation(
         grpc.ccg_parse(self.stub, mtext, grpc.DEFAULT_SESSION))
     sentence = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     dprint(pt_to_ccg_derivation(pt))
     dprint(sentence.get_drs())
     nps = [span.text for _, span in sentence.get_np_nominals()]
     for phrase in ('The reduced dividend', 'payable', 'Jan. 2',
                    'Dec. 15', 'stock', 'record'):
         self.assertTrue(phrase in nps)
コード例 #14
0
 def test4_ApposInterrupt(self):
     """Apposition binds 'Bell' to 'a telecommunications company' via _AKA."""
     text = r"Bell, a telecommunications company, which is located in Los Angeles, makes and distributes electronics, computers, and building products"
     mtext = preprocess_sentence(text)
     pt = parse_ccg_derivation(
         grpc.ccg_parse(self.stub, mtext, grpc.DEFAULT_SESSION))
     sentence = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     d = sentence.get_drs()
     dprint(pt_to_ccg_derivation(pt))
     dprint(d)
     nominals = sentence.get_np_nominals()
     phrases = [span.text for _, span in nominals]
     self.assertTrue('Bell' in phrases)
     self.assertTrue('a telecommunications company' in phrases)
     X = [p for p in nominals if p[1].text == 'Bell'][0][0]
     Y = [p for p in nominals
          if p[1].text == 'a telecommunications company'][0][0]
     self.assertNotEqual(X, Y)
     self.assertIsNotNone(d.find_condition(Rel('_AKA', [X, Y])))
     # Exactly one _AKA condition appears in the DRS.
     self.assertTrue(len(repr(d).split('_AKA')) == 2)
コード例 #15
0
 def test3_ApposInterrupt(self):
     """Apposition binds 'Robbie' to 'a hot-tempered tennis player' via _AKA."""
     text = r"Robbie, a hot-tempered tennis player, charged the umpire and tried to crack the poor man's skull with a racket."
     mtext = preprocess_sentence(text)
     pt = parse_ccg_derivation(
         grpc.ccg_parse(self.stub, mtext, grpc.DEFAULT_SESSION))
     sentence = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     d = sentence.get_drs()
     dprint(pt_to_ccg_derivation(pt))
     dprint(d)
     nominals = sentence.get_np_nominals()
     phrases = [span.text for _, span in nominals]
     self.assertTrue('Robbie' in phrases)
     self.assertTrue('a hot-tempered tennis player' in phrases)
     X = [p for p in nominals if p[1].text == 'Robbie'][0][0]
     Y = [p for p in nominals
          if p[1].text == 'a hot-tempered tennis player'][0][0]
     self.assertNotEqual(X, Y)
     self.assertIsNotNone(d.find_condition(Rel('_AKA', [X, Y])))
     # Exactly one _AKA condition appears in the DRS.
     self.assertTrue(len(repr(d).split('_AKA')) == 2)
コード例 #16
0
 def test2_ApposInterrupt(self):
     """Apposition binds 'Reliable' to 'eleven-year-old beagle' via _AKA."""
     text = r"Reliable, Diane's eleven-year-old beagle, chews holes in the living room carpeting as if he were still a puppy."
     mtext = preprocess_sentence(text)
     pt = parse_ccg_derivation(
         grpc.ccg_parse(self.stub, mtext, grpc.DEFAULT_SESSION))
     sentence = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     d = sentence.get_drs()
     dprint(pt_to_ccg_derivation(pt))
     dprint(d)
     nominals = sentence.get_np_nominals()
     phrases = [span.text for _, span in nominals]
     for expected in ('Reliable', "eleven-year-old beagle", "Diane"):
         self.assertTrue(expected in phrases)
     X = [p for p in nominals if p[1].text == 'Reliable'][0][0]
     Y = [p for p in nominals
          if p[1].text == "eleven-year-old beagle"][0][0]
     self.assertNotEqual(X, Y)
     self.assertIsNotNone(d.find_condition(Rel('_AKA', [X, Y])))
     # Exactly one _AKA condition appears in the DRS.
     self.assertTrue(len(repr(d).split('_AKA')) == 2)
コード例 #17
0
ファイル: number_test.py プロジェクト: marbles-ai/ie
 def test1_Currency_00_0195(self):
     """Currency amount '$ 20.5 million' is kept as one NP after preprocessing."""
     text = r"On the other hand, had it existed then, Cray Computer would have incurred a $20.5 million loss."
     etext = r"On the other hand , had it existed then , Cray Computer would have incurred a $ 20.5 million loss ."
     mtext = preprocess_sentence(text)
     self.assertEqual(etext, mtext)
     pt = parse_ccg_derivation(
         grpc.ccg_parse(self.stub, mtext, grpc.DEFAULT_SESSION))
     sentence = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     dprint(pt_to_ccg_derivation(pt))
     dprint(sentence.get_drs())
     nps = [span.text for _, span in sentence.get_np_nominals()]
     for phrase in ('the other hand', 'Cray-Computer', '$ 20.5 million'):
         self.assertTrue(phrase in nps)
     vps = [span.text for _, span in sentence.get_vp_nominals()]
     for phrase in ('had', 'existed', 'would have incurred'):
         self.assertTrue(phrase in vps)
コード例 #18
0
ファイル: passive_test.py プロジェクト: marbles-ai/ie
 def test10_Brutus(self):
     """Passive voice role assignment for 'Ceasar was stabbed by Brutus'."""
     derivation = grpc.ccg_parse(self.stub, "Ceasar was stabbed by Brutus",
                                 grpc.DEFAULT_SESSION)
     pt = parse_ccg_derivation(derivation)
     sentence = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     d = sentence.get_drs()
     dprint(pt_to_ccg_derivation(pt))
     dprint(d)
     fnps = sentence.get_np_nominals()
     nps = [span.text for _, span in fnps]
     self.assertTrue('Brutus' in nps)
     self.assertTrue('Ceasar' in nps)
     fvps = sentence.get_vp_nominals()
     vps = [span.text for _, span in fvps]
     self.assertTrue('was stabbed' in vps)
     E = [p for p in fvps if p[1].text == "was stabbed"][0][0]
     A1 = [p for p in fnps if p[1].text == "Brutus"][0][0]
     A0 = [p for p in fnps if p[1].text == "Ceasar"][0][0]
     self.assertIsNotNone(d.find_condition(Rel('_ARG0', [E, A0])))
     self.assertIsNotNone(d.find_condition(Rel('_ARG1', [E, A1])))
コード例 #19
0
 def test10_Ccgbank_00_0036(self):
     """WSJ 00-0036: NP and VP nominal extraction on a long sentence."""
     text = "Average maturity of the funds' investments lengthened by a day to 41 days, the longest since early August, according to Donoghue's."
     etext = "Average maturity of the funds ' investments lengthened by a day to 41 days , the longest since early August , according to Donoghue 's ."
     mtext = preprocess_sentence(text)
     self.assertEqual(etext, mtext)
     pt = parse_ccg_derivation(
         grpc.ccg_parse(self.stub, mtext, grpc.DEFAULT_SESSION))
     sentence = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     dprint(pt_to_ccg_derivation(pt))
     dprint(sentence.get_drs())
     nps = [span.text for _, span in sentence.get_np_nominals()]
     for phrase in ('the funds', 'a day', '41 days', 'the longest',
                    'early August'):
         self.assertTrue(phrase in nps)
     vps = [span.text for _, span in sentence.get_vp_nominals()]
     for phrase in ('lengthened', 'according'):
         self.assertTrue(phrase in vps)
コード例 #20
0
ファイル: conj_test.py プロジェクト: marbles-ai/ie
 def test04_AndOfVerb(self):
     """Conjoined verbs share agent and theme: 'Bell makes and distributes computers'."""
     mtext = preprocess_sentence("Bell makes and distributes computers")
     derivation = grpc.ccg_parse(self.stub, mtext, grpc.DEFAULT_SESSION)
     pt = parse_ccg_derivation(derivation)
     sentence = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     d = sentence.get_drs()
     dprint(pt_to_ccg_derivation(pt))
     dprint(d)
     f = sentence.select_phrases(RT_PROPERNAME | RT_ENTITY | RT_EVENT)
     phrases = [span.text for _, span in f.iteritems()]
     for expected in ('Bell', 'makes distributes', 'computers'):
         self.assertTrue(expected in phrases)
     def ref_of(txt):
         # Referent of the first phrase whose text matches exactly.
         return [p for p in f.iteritems() if p[1].text == txt][0][0]
     E1 = ref_of('makes distributes')
     X1 = ref_of('Bell')
     X2 = ref_of('computers')
     self.assertIsNotNone(d.find_condition(Rel('_EVENT', [E1])))
     self.assertIsNotNone(d.find_condition(Rel('_ARG0', [E1, X1])))
     self.assertIsNotNone(d.find_condition(Rel('_ARG1', [E1, X2])))
コード例 #21
0
                lnout.append('RAW_LN=%d' % idx)
                lc = len(lnout)
                lnout.append(ln.strip())

                if mm not in wsjd:
                    lnout.append('ERR: cannot find mapping to %s' % mm)
                    total_err += 1
                    continue
                gold_derivation = wsjd[mm]

                e_sentence = None
                n_sentence = None
                options = CO_NO_VERBNET | CO_NO_WIKI_SEARCH | CO_VARNAMES_MATCH_WORD_INDEX
                try:
                    if estub is not None:
                        ed = grpc.ccg_parse(estub, ln)
                        ept = parse_ccg_derivation(ed)
                        e_sentence = process_ccg_pt(ept, options)

                    if nstub is not None:
                        nd = grpc.ccg_parse(nstub, ln)
                        npt = parse_ccg_derivation(nd)
                        n_sentence = process_ccg_pt(npt, options)

                    gpt = parse_ccg_derivation(gold_derivation)
                    gold_sentence = process_ccg_pt(gpt, options)
                except UnaryRuleError as e:
                    lnout.append('ERR: %s' % e)
                    total_err += 1
                    continue
                except Exception as e:
コード例 #22
0
 def test10_Ccgbank_00_0099(self):
     """WSJ 00-0099: nominal extraction and argument structure on a long sentence.

     Checks NP/VP phrase extraction (including hyphen-joined proper names),
     then the _ARGn roles for the verbs and the _POSS relations between
     possessor and possessed proper names.
     """
     text = "Plans that give advertisers discounts for maintaining or increasing ad spending have become permanent fixtures at the news weeklies and underscore the fierce competition between Newsweek, Time Warner Inc.'s Time magazine, and Mortimer B. Zuckerman's U.S. News & World Report."
     mtext = preprocess_sentence(text)
     derivation = grpc.ccg_parse(self.stub, mtext, grpc.DEFAULT_SESSION)
     pt = parse_ccg_derivation(derivation)
     sentence = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     d = sentence.get_drs()
     dprint(pt_to_ccg_derivation(pt))
     dprint(d)
     f = sentence.get_np_nominals()
     phrases = [sp.text for r, sp in f]
     self.assertTrue('Plans' in phrases)
     self.assertTrue('advertisers' in phrases)
     self.assertTrue('discounts' in phrases)
     self.assertTrue('ad spending' in phrases)
     self.assertTrue('permanent fixtures' in phrases)
     self.assertTrue('the news weeklies' in phrases)
     self.assertTrue('the fierce competition' in phrases)
     self.assertTrue("Newsweek" in phrases)
     self.assertTrue("Time-Warner-Inc." in phrases)
     self.assertTrue("Time-magazine" in phrases)
     self.assertTrue("Mortimer-B.-Zuckerman" in phrases)
     self.assertTrue("U.S.-News-&-World-Report" in phrases)
     vf = sentence.get_vp_nominals()
     vphrases = [sp.text for r, sp in vf]
     self.assertTrue('give' in vphrases)
     self.assertTrue('maintaining increasing' in vphrases)
     self.assertTrue('have become' in vphrases)
     self.assertTrue('underscore' in vphrases)
     # Each filter(...)[0][0] extracts the referent of the first
     # (referent, span) pair whose phrase text matches exactly.
     give = filter(lambda x: 'give' == x[1].text, vf)[0][0]
     become = filter(lambda x: 'have become' == x[1].text, vf)[0][0]
     uscore = filter(lambda x: 'underscore' == x[1].text, vf)[0][0]
     minc = filter(lambda x: 'maintaining increasing' == x[1].text,
                   vf)[0][0]
     plans = filter(lambda x: 'Plans' == x[1].text, f)[0][0]
     advertisers = filter(lambda x: 'advertisers' == x[1].text, f)[0][0]
     discounts = filter(lambda x: 'discounts' == x[1].text, f)[0][0]
     spending = filter(lambda x: 'ad spending' == x[1].text, f)[0][0]
     fixtures = filter(lambda x: 'permanent fixtures' == x[1].text, f)[0][0]
     weeklies = filter(lambda x: 'the news weeklies' == x[1].text, f)[0][0]
     timeinc = filter(lambda x: 'Time-Warner-Inc.' == x[1].text, f)[0][0]
     timemag = filter(lambda x: 'Time-magazine' == x[1].text, f)[0][0]
     mortimer = filter(lambda x: 'Mortimer-B.-Zuckerman' == x[1].text,
                       f)[0][0]
     uswr = filter(lambda x: 'U.S.-News-&-World-Report' == x[1].text,
                   f)[0][0]
     self.assertTrue(
         d.find_condition(Rel('_ARG0', [give, plans])) is not None)
     self.assertTrue(
         d.find_condition(Rel('_ARG1', [give, advertisers])) is not None)
     self.assertTrue(
         d.find_condition(Rel('_ARG2', [give, discounts])) is not None)
     self.assertTrue(
         d.find_condition(Rel('_ARG0', [minc, plans])) is not None)
     self.assertTrue(
         d.find_condition(Rel('_ARG1', [minc, spending])) is not None)
     self.assertTrue(
         d.find_condition(Rel('_ARG0', [become, plans])) is not None)
     self.assertTrue(
         d.find_condition(Rel('_ARG1', [become, fixtures])) is not None)
     self.assertTrue(
         d.find_condition(Rel('_POSS', [mortimer, uswr])) is not None)
     self.assertTrue(
         d.find_condition(Rel('_POSS', [timeinc, timemag])) is not None)
Code example #23
0
File: __init__.py  Project: marbles-ai/ie
    def run(self):
        """Process messages from the AWS news queue.

        For each queued story: parse the title and every paragraph sentence
        through the CCG parser, shrink the serialized result if it is too
        large for the outgoing queue, then forward the JSON payload to the
        ccg queue and delete the source message.
        """
        for message in receive_messages(self.aws.news_queue, MessageAttributeNames=['All']):
            global _logger
            # Attributes will be passed onto next queue
            attributes = message.message_attributes
            mhash = attributes['hash']['StringValue']
            _logger.debug('Received news_queue(%s) -> hash(%s)', message.message_id, mhash)
            body = json.loads(message.body)
            retry = 3
            ccgbank = None
            title = body['title']
            # Split the story body into non-empty, stripped paragraphs
            # (Python 2 filter/map return lists, so len() below is valid).
            paragraphs_in = filter(lambda y: len(y) != 0, map(lambda x: x.strip(), body['content'].split('\n')))
            paragraphs_out = []
            if len(paragraphs_in) == 0:
                # BUGFIX: pass the lazy %-format args individually; the original
                # passed a single (mhash, title) tuple to a two-placeholder
                # format string, which raises inside logging when this fires.
                _logger.debug('No paragraphs for story %s\n%s', mhash, title)
            # Use NLTK to split paragraphs into sentences.
            for p in paragraphs_in:
                sentences = filter(lambda x: len(x.strip()) != 0, sent_tokenize(p))
                paragraphs_out.append(sentences)

            if self.state.terminate:
                break

            result = {}
            result['title'] = {}
            while retry:
                try:
                    # Parse the title first, then each sentence of each paragraph.
                    ccgbank = grpc.ccg_parse(self.aws.stub, title, grpc.DEFAULT_SESSION)
                    pt = parse_ccg_derivation(ccgbank)
                    ccg = process_ccg_pt(pt, options=self.options)
                    result['title']['lexemes'] = [x.get_json() for x in ccg.get_span()]
                    result['title']['constituents'] = [c.get_json() for c in ccg.constituents]
                    ccgpara = []
                    result['paragraphs'] = ccgpara
                    for sentences in paragraphs_out:
                        ccgsent = []
                        ccgpara.append(ccgsent)
                        for s in sentences:
                            smod = preprocess_sentence(s)
                            ccgbank = grpc.ccg_parse(self.aws.stub, smod, grpc.DEFAULT_SESSION)
                            pt = parse_ccg_derivation(ccgbank)
                            ccg = process_ccg_pt(pt, options=self.options)
                            ccgentry = {}
                            ccgentry['lexemes'] = [x.get_json() for x in ccg.get_span()]
                            ccgentry['constituents'] = [c.get_json() for c in ccg.constituents]
                            ccgsent.append(ccgentry)
                    break   # exit while
                except requests.exceptions.ConnectionError as e:
                    # Transient network failure - back off briefly and retry.
                    time.sleep(0.25)
                    retry -= 1
                    _logger.exception('AwsNewsQueueReader.run', exc_info=e)
                    if self.state.pass_on_exceptions:
                        raise
                except Exception as e:
                    # After X reads AWS sends the item to the dead letter queue.
                    # X is configurable in AWS console.
                    retry = 0
                    _logger.exception('AwsNewsQueueReader.run', exc_info=e, rlimitby=mhash)
                    if self.state.pass_on_exceptions:
                        raise

                if self.state.terminate:
                    retry = 0
                    break

            # retry == 0 indicates failure
            if retry == 0:
                continue


            try:
                # Let the queue know that the message is processed
                message.delete()
                if self.aws.ccg_queue:
                    ireduce = -1
                    iorig = len(result['paragraphs'])

                    # Keep the serialized body under 200KiB (presumably to
                    # stay within the outgoing queue's message size limit -
                    # confirm) by dropping trailing paragraphs until it fits.
                    while True:
                        strm = StringIO.StringIO()
                        # Add indent so easier to debug
                        json.dump(result, strm, indent=2)
                        data = strm.getvalue()
                        if len(data) >= 200*1024:
                            para = result['paragraphs']
                            # Scale the paragraph count by the target/actual
                            # size ratio (Python 2 integer division), clamped
                            # to [1, len(para)-1].
                            ireduce = max([1, (len(para) * 200 * 1024)/ len(data)])
                            ireduce = min([len(para)-1, ireduce])
                            result['paragraphs'] = para[0:ireduce]
                        else:
                            break

                        if len(result['paragraphs']) <= 1:
                            break

                    if ireduce >= 0:
                        _logger.warning('Hash(%s) ccg paragraphs reduced from %d to %d' % (mhash, iorig, ireduce))
                    response = self.aws.ccg_queue.send_message(MessageAttributes=attributes, MessageBody=data)
                    _logger.debug('Sent hash(%s) -> ccg_queue(%s)', mhash, response['MessageId'])
            except Exception as e:
                _logger.exception('AwsNewsQueueReader.run', exc_info=e, rlimitby=mhash)
                if self.state.pass_on_exceptions:
                    raise
Code example #24
0
File: infox.py  Project: marbles-ai/ie
    def parse(self, request, context):
        """Parse a message.

        Runs the request text through the CCG parser and builds a GSentence
        protobuf of lexemes and constituents.

        Args:
            request: gRPC request carrying `text` (the sentence to parse)
                and `options` (flags forwarded to process_ccg_pt).
            context: gRPC servicer context used to set status code/details.

        Returns:
            A populated infox_service_pb2.GSentence.

        Raises:
            RuntimeError: if the application is terminating or retries
                are exhausted.
        """
        retry = 3
        # NOTE(review): both except branches below end in an unconditional
        # `raise`, so this loop never actually retries - the retry counter
        # is effectively dead. Confirm whether the ConnectionError branch
        # was meant to `continue` instead of raising.
        while retry:
            if self.state.terminate:
                # Shutting down: abort the RPC before doing any work.
                context.set_code(grpc.StatusCode.CANCELLED)
                context.set_details('Application terminating.')
                raise RuntimeError('Application terminating!')

            try:
                # EasyXXX does not handle these - normalize the raw text
                # before handing it to the parser.
                smod = preprocess_sentence(request.text)
                ccgbank = gsvc.ccg_parse(self.ccg_stub, smod,
                                         gsvc.DEFAULT_SESSION)
                pt = parse_ccg_derivation(ccgbank)
                ccg = process_ccg_pt(pt, options=request.options)
                sent = ccg.get_verbnet_sentence()

                # Copy each lexeme into the protobuf response.
                response = infox_service_pb2.GSentence()
                for lex in sent:
                    glex = response.lexemes.add()
                    glex.head = lex.head
                    glex.idx = lex.idx
                    glex.mask = lex.mask
                    for r in lex.refs:
                        # DRS referent variables are serialized as strings.
                        glex.refs.append(r.var.to_string())
                    glex.pos = lex.pos.tag
                    glex.word = lex.word
                    glex.stem = lex.stem
                    glex.category = lex.category.signature
                    if lex.wiki_data is not None:
                        # Optional Wikipedia annotation for this lexeme.
                        glex.wikidata.title = lex.wiki_data.title
                        glex.wikidata.summary = lex.wiki_data.summary
                        glex.wikidata.page_categories.extend(
                            lex.wiki_data.page_categories)
                        glex.wikidata.url = lex.wiki_data.url

                # Copy constituent spans (token index lists) and their types.
                for c in ccg.constituents:
                    gc = response.constituents.add()
                    gc.span.extend(c.span.get_indexes())
                    gc.vntype = c.vntype.signature
                    gc.head = c.chead

                return response

            except requests.exceptions.ConnectionError as e:
                # Transient network failure while talking to the CCG service.
                self.state.wait(0.25)
                retry -= 1
                self.logger.exception('Infox.parse', exc_info=e)
                context.set_code(grpc.StatusCode.ABORTED)
                # NOTE(review): `e.message` is Python 2 only.
                context.set_details(e.message)
                raise

            except Exception as e:
                retry = 0
                self.logger.exception('Infox.parse', exc_info=e)
                context.set_code(grpc.StatusCode.ABORTED)
                context.set_details(e.message)
                raise

        # Only reachable if the loop exits without returning or raising.
        context.set_code(grpc.StatusCode.ABORTED)
        context.set_details('Too many retries!')
        raise RuntimeError('Too many retries!')
Code example #25
0
File: process_text.py  Project: marbles-ai/ie
        if options.ofmt is not None:
            if options.ofmt not in [
                    'ccgbank', 'html', 'logic', 'extended', 'drs'
            ]:
                die('bad output format %s, must be ccgbank|html|logic|extended'
                    % options.ofmt)
            # Create a session to match output format, default is CCGBANK
            if options.ofmt != 'ccgbank' and options.ofmt != 'drs':
                sessionId = grpc.create_session(stub, options.ofmt)

        titleSrch = re.compile(titleRe)
        if not options.book:
            line = preprocess_sentence(' '.join(args))
            html = None
            # FIXME: Convert to python 3. Unicode is default.
            ccg = grpc.ccg_parse(stub, line, sessionId)
            if options.ofmt == 'html':
                html = ccg
                ccg = None
            drs = None
            pccg = None
            fol = None
            constituents = None
            orphaned = None
            conjoins = None
            functor_phrases = None
            vnconstituents = ''
            constituents = ''
            vsent = None
            sentence = None