Python preprocess_sentence Beispiele, marbles.ie.utils.text.preprocess_sentence Python Beispiele

Beispiel #1

0

Datei anzeigen

    def test1_JsonFiles(self):
        # Parses one stored JSON news story end to end: the title and every
        # sentence of every paragraph go through the CCG parser service, and
        # the collected spans must serialise to JSON. The test passes when no
        # step raises; it makes no assertions about the parse content.
        wanted_stem = '9255a890ffe40c05876d8d402044ab11'
        allfiles = []
        for fn in os.listdir(datapath):
            fullpath = os.path.join(datapath, fn)
            if not os.path.isfile(fullpath):
                continue
            stem, ext = os.path.splitext(fn)
            if ext == '.json' and stem == wanted_stem:
                allfiles.append(fullpath)

        for fn in allfiles:
            with open(fn, 'r') as fd:
                # No `encoding` keyword: UTF-8 is already json.load's default
                # and the keyword no longer exists in Python 3.9+.
                body = json.load(fd)

            smod = preprocess_sentence(body['title'])
            ccgbank = grpc.ccg_parse(self.stub, smod, grpc.DEFAULT_SESSION)
            pt = parse_ccg_derivation(ccgbank)
            ccg = process_ccg_pt(pt)

            ccgbody = {
                'story': {
                    'title': [x.get_json() for x in ccg.get_span()],
                    'paragraphs': []
                }
            }

            # Real lists (not filter/map results) so the code behaves the same
            # whether filter() returns a list or a lazy iterator.
            stripped = [line.strip() for line in body['content'].split('\n')]
            paragraphs = [p for p in stripped if p]
            for i, p in enumerate(paragraphs):
                sentences = [s for s in sent_tokenize(p) if s.strip()]
                sp = []
                for j, s in enumerate(sentences):
                    # Paragraph:sentence index, to locate a failing sentence.
                    dprint('p:s = %d:%d' % (i, j))
                    smod = preprocess_sentence(s)
                    ccgbank = grpc.ccg_parse(self.stub, smod,
                                             grpc.DEFAULT_SESSION)
                    pt = parse_ccg_derivation(ccgbank)
                    ccg = process_ccg_pt(pt, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
                    sp.append([x.get_json() for x in ccg.get_span()])
                ccgbody['story']['paragraphs'].append(sp)

            # The result is discarded; the call only proves the collected
            # structure is JSON-serialisable.
            json.dumps(ccgbody)

Beispiel #2

0

Datei anzeigen

Datei: conj_test.py Projekt: marbles-ai/ie

 def test05_AndOfVerb_AndOfObj(self):
     # Two "and"-joined verbs followed by a three-item object list. Checks
     # which phrases are selected and which event/argument relations the DRS
     # contains.
     text = "Bell makes and distributes computers, electronics, and building products"
     cleaned = preprocess_sentence(text)
     ccg_text = grpc.ccg_parse(self.stub, cleaned, grpc.DEFAULT_SESSION)
     tree = parse_ccg_derivation(ccg_text)
     sentence = process_ccg_pt(tree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     drs = sentence.get_drs()
     dprint(pt_to_ccg_derivation(tree))
     dprint(drs)

     selection = RT_PROPERNAME | RT_ENTITY | RT_EVENT | RT_ATTRIBUTE
     found = sentence.select_phrases(selection)
     phrases = [span.text for _, span in found.iteritems()]
     for expected in ('Bell', 'makes distributes', 'computers',
                      'electronics'):
         self.assertTrue(expected in phrases)
     # Note if we add RT_EMPTY_DRS to the selection criteria then this phrase becomes 'and building products'
     self.assertTrue('building products' in phrases)
     self.assertEqual(5, len(phrases))

     def key_of(phrase):
         # Key paired with the first selected span whose text is `phrase`.
         hits = [entry for entry in found.iteritems()
                 if phrase == entry[1].text]
         return hits[0][0]

     E1 = key_of('makes distributes')
     X1 = key_of('Bell')
     # Y1 and Y2 are looked up but not asserted on (see the TODO below).
     Y1 = key_of('computers')
     Y2 = key_of('electronics')
     Y3 = key_of('building products')

     # TODO: should we add proposition for multi NP's conjoined?
     for name, refs in (('_EVENT', [E1]),
                        ('_ARG0', [E1, X1]),
                        ('_ARG1', [E1, Y3])):
         self.assertTrue(drs.find_condition(Rel(name, refs)) is not None)

Beispiel #3

0

Datei anzeigen

Datei: conj_test.py Projekt: marbles-ai/ie

 def test10_OrOfVerb_OrInBrackets(self):
     # "or"-joined verbs plus a bracketed "or" phrase. Phrases are selected
     # with a predicate (non-contiguous spans allowed) and the DRS is checked
     # for the event, argument and possessive relations.
     text = "That which is perceived or known or inferred to have its own distinct existence (living or nonliving)"
     cleaned = preprocess_sentence(text)
     ccg_text = grpc.ccg_parse(self.stub, cleaned, grpc.DEFAULT_SESSION)
     tree = parse_ccg_derivation(ccg_text)
     sentence = process_ccg_pt(tree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     drs = sentence.get_drs(nodups=True)
     dprint(pt_to_ccg_derivation(tree))
     dprint(drs)

     def keep(lexeme):
         # RT_EMPTY_DRS adds 'or' to phrases
         return lexeme.pos is POS.from_cache('WDT') or \
             0 == (lexeme.mask & RT_EMPTY_DRS)

     found = sentence.select_phrases(keep, contiguous=False)
     phrases = [span.text for _, span in found.iteritems()]
     for expected in ('That which', 'have', 'is perceived known inferred',
                      'its own distinct existence'):
         self.assertTrue(expected in phrases)

     def entry_for(phrase):
         # First (key, span) pair whose span text is `phrase`.
         hits = [entry for entry in found.iteritems()
                 if phrase == entry[1].text]
         return hits[0]

     verb1 = entry_for('is perceived known inferred')
     verb2 = entry_for('have')
     agent = entry_for('That which')
     theme = entry_for('its own distinct existence')
     X1, E1, E2 = agent[0], verb1[0], verb2[0]
     # The theme's referents are read from the first two items of its span.
     X2 = theme[1][0].refs[1]
     X3 = theme[1][1].refs[0]

     for name, refs in (('_EVENT', [E1]),
                        ('_ARG0', [E1, X1]),
                        ('_ARG1', [E1, E2])):
         self.assertTrue(drs.find_condition(Rel(name, refs)) is not None)
     # TODO: should the theme attach to X2?
     for name, refs in (('_ARG1', [E2, X3]),
                        ('_POSS', [X2, X3])):
         self.assertTrue(drs.find_condition(Rel(name, refs)) is not None)

Beispiel #4

0

Datei anzeigen

Datei: conj_test.py Projekt: marbles-ai/ie

 def test01_AndOfSubj(self):
     # "and"-joined subject. Checks the selected proper names and event, and
     # the relations the DRS holds for them.
     text = "John and Paul went to the movies"
     cleaned = preprocess_sentence(text)
     ccg_text = grpc.ccg_parse(self.stub, cleaned, grpc.DEFAULT_SESSION)
     tree = parse_ccg_derivation(ccg_text)
     sentence = process_ccg_pt(tree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     drs = sentence.get_drs()
     dprint(pt_to_ccg_derivation(tree))
     dprint(drs)

     found = sentence.select_phrases(RT_PROPERNAME | RT_EVENT)
     phrases = [span.text for _, span in found.iteritems()]
     for expected in ('John', 'Paul', 'went'):
         self.assertTrue(expected in phrases)

     def key_of(phrase):
         # Key paired with the first selected span whose text is `phrase`.
         hits = [entry for entry in found.iteritems()
                 if phrase == entry[1].text]
         return hits[0][0]

     J = key_of('John')
     P = key_of('Paul')
     E = key_of('went')

     for name, refs in (('_EVENT', [E]),
                        ('go', [E]),
                        ('John', [J]),
                        ('Paul', [P]),
                        ('_ARG0', [E, J])):
         self.assertTrue(drs.find_condition(Rel(name, refs)) is not None)

Beispiel #5

0

Datei anzeigen

Datei: conj_test.py Projekt: marbles-ai/ie

 def test02_AndOfObj(self):
     # "and"-joined object. Checks the selected proper names and event, and
     # the relations the DRS holds for them.
     text = "He saw John and Paul"
     cleaned = preprocess_sentence(text)
     ccg_text = grpc.ccg_parse(self.stub, cleaned, grpc.DEFAULT_SESSION)
     tree = parse_ccg_derivation(ccg_text)
     sentence = process_ccg_pt(tree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     drs = sentence.get_drs()
     dprint(pt_to_ccg_derivation(tree))
     dprint(drs)

     found = sentence.select_phrases(RT_PROPERNAME | RT_EVENT)
     phrases = [span.text for _, span in found.iteritems()]
     for expected in ('John', 'Paul', 'saw'):
         self.assertTrue(expected in phrases)

     def key_of(phrase):
         # Key paired with the first selected span whose text is `phrase`.
         hits = [entry for entry in found.iteritems()
                 if phrase == entry[1].text]
         return hits[0][0]

     J = key_of('John')
     P = key_of('Paul')
     E = key_of('saw')

     # FIXME: wn lemmatizer does not convert saw to see - I guess due to ambiguity
     for name, refs in (('_EVENT', [E]),
                        ('saw', [E]),
                        ('John', [J]),
                        ('Paul', [P]),
                        ('_ARG1', [E, J])):
         self.assertTrue(drs.find_condition(Rel(name, refs)) is not None)

Beispiel #6

0

Datei anzeigen

Datei: conj_test.py Projekt: marbles-ai/ie

 def test03_OrOfObj(self):
     # "or"-joined object of a preposition. Checks the selected entities and
     # event, and the relations the DRS holds for them.
     text = "To participate in games or sport"
     cleaned = preprocess_sentence(text)
     ccg_text = grpc.ccg_parse(self.stub, cleaned, grpc.DEFAULT_SESSION)
     tree = parse_ccg_derivation(ccg_text)
     sentence = process_ccg_pt(tree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     drs = sentence.get_drs()
     dprint(pt_to_ccg_derivation(tree))
     dprint(drs)

     found = sentence.select_phrases(RT_ENTITY | RT_EVENT)
     phrases = [span.text for _, span in found.iteritems()]
     for expected in ('participate', 'games', 'sport'):
         self.assertTrue(expected in phrases)

     def key_of(phrase):
         # Key paired with the first selected span whose text is `phrase`.
         hits = [entry for entry in found.iteritems()
                 if phrase == entry[1].text]
         return hits[0][0]

     X1 = key_of('games')
     X2 = key_of('sport')
     E = key_of('participate')

     for name, refs in (('_EVENT', [E]),
                        ('participate', [E]),
                        ('games', [X1]),
                        ('sport', [X2]),
                        ('_ARG1', [E, X2])):
         self.assertTrue(drs.find_condition(Rel(name, refs)) is not None)

Beispiel #7

0

Datei anzeigen

Datei: number_test.py Projekt: marbles-ai/ie

 def test1_Currency_00_0194(self):
     # Sentence containing currency amounts. Checks the preprocessed text,
     # the noun-phrase and verb-phrase nominals, and the relations linking
     # them in the DRS.
     text = r"Without the Cray-3 research and development expenses, the company would have been able to report a profit of $19.3 million for the first half of 1989 rather than the $5.9 million it posted."
     etext = r"Without the Cray-3 research and development expenses , the company would have been able to report a profit of $ 19.3 million for the first half of 1989 rather than the $ 5.9 million it posted"
     cleaned = preprocess_sentence(text)
     self.assertEqual(etext, cleaned)
     ccg_text = grpc.ccg_parse(self.stub, cleaned, grpc.DEFAULT_SESSION)
     tree = parse_ccg_derivation(ccg_text)
     sentence = process_ccg_pt(tree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     drs = sentence.get_drs(nodups=True)
     dprint(pt_to_ccg_derivation(tree))
     dprint(drs)

     fnps = sentence.get_np_nominals()
     nps = [span.text for _, span in fnps]
     for expected in ('the Cray-3 research and development expenses',
                      'the company', 'a profit', '$ 19.3 million',
                      'the first half', 'the $ 5.9 million', '1989'):
         self.assertTrue(expected in nps)

     fvps = sentence.get_vp_nominals()
     vps = [span.text for _, span in fvps]
     for expected in ('would have been', 'report', 'posted'):
         self.assertTrue(expected in vps)

     def key_in(pairs, phrase):
         # First element of the first pair in `pairs` whose span text is
         # `phrase`.
         hits = [pair for pair in pairs if phrase == pair[1].text]
         return hits[0][0]

     would_have_been = key_in(fvps, 'would have been')
     report = key_in(fvps, 'report')
     posted = key_in(fvps, 'posted')
     cray_rnd = key_in(fnps, 'the Cray-3 research and development expenses')
     company = key_in(fnps, 'the company')
     profit = key_in(fnps, 'a profit')
     first_half = key_in(fnps, 'the first half')
     n1989 = key_in(fnps, '1989')
     n19_3M = key_in(fnps, '$ 19.3 million')
     n5_9M = key_in(fnps, 'the $ 5.9 million')

     for name, refs in (('without', [would_have_been, cray_rnd]),
                        ('_ARG0', [would_have_been, company]),
                        ('_ARG0', [report, company]),
                        ('_ARG1', [report, profit]),
                        ('of', [profit, n19_3M]),
                        ('for', [profit, first_half]),
                        ('of', [first_half, n1989]),
                        ('_ARG1', [posted, n5_9M])):
         self.assertTrue(drs.find_condition(Rel(name, refs)) is not None)

Beispiel #8

0

Datei anzeigen

Datei: number_test.py Projekt: marbles-ai/ie

 def test2_Date_21_0985(self):
     # Sentence containing a full date. Checks the preprocessed text and the
     # noun-phrase nominals found in the parse.
     text = r"Annualized interest rates on certain investments as reported by the Federal Reserve Board on a weekly-average basis: 1989 and Wednesday October 4, 1989."
     etext = r"Annualized interest rates on certain investments as reported by the Federal Reserve Board on a weekly-average basis : 1989 and Wednesday October 4 , 1989"
     cleaned = preprocess_sentence(text)
     self.assertEqual(etext, cleaned)
     ccg_text = grpc.ccg_parse(self.stub, cleaned, grpc.DEFAULT_SESSION)
     tree = parse_ccg_derivation(ccg_text)
     sentence = process_ccg_pt(tree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     drs = sentence.get_drs()
     dprint(pt_to_ccg_derivation(tree))
     dprint(drs)

     fnps = sentence.get_np_nominals()
     nps = [span.text for _, span in fnps]
     for expected in ('Annualized interest rates',
                      'certain investments',
                      'the Federal-Reserve-Board',
                      'a weekly-average basis',
                      'Wednesday October 4'):
         self.assertTrue(expected in nps)

Beispiel #9

0

Datei anzeigen

Datei: number_test.py Projekt: marbles-ai/ie

 def test2_Date_00_1228(self):
     # Sentence with abbreviated month dates. Preprocessing is expected to
     # leave the text unchanged; the noun-phrase nominals are then checked.
     text = r"The reduced dividend is payable Jan. 2 to stock of record Dec. 15"
     etext = r"The reduced dividend is payable Jan. 2 to stock of record Dec. 15"
     cleaned = preprocess_sentence(text)
     self.assertEqual(etext, cleaned)
     ccg_text = grpc.ccg_parse(self.stub, cleaned, grpc.DEFAULT_SESSION)
     tree = parse_ccg_derivation(ccg_text)
     sentence = process_ccg_pt(tree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     drs = sentence.get_drs()
     dprint(pt_to_ccg_derivation(tree))
     dprint(drs)

     fnps = sentence.get_np_nominals()
     nps = [span.text for _, span in fnps]
     for expected in ('The reduced dividend', 'payable', 'Jan. 2',
                      'Dec. 15', 'stock', 'record'):
         self.assertTrue(expected in nps)

Beispiel #10

0

Datei anzeigen

 def test3_ApposInterrupt(self):
     # Appositive interrupting the subject. The two noun phrases must have
     # different keys, be linked by an _AKA relation, and '_AKA' must occur
     # exactly once in the DRS repr.
     text = r"Robbie, a hot-tempered tennis player, charged the umpire and tried to crack the poor man's skull with a racket."
     cleaned = preprocess_sentence(text)
     ccg_text = grpc.ccg_parse(self.stub, cleaned, grpc.DEFAULT_SESSION)
     tree = parse_ccg_derivation(ccg_text)
     sentence = process_ccg_pt(tree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     drs = sentence.get_drs()
     dprint(pt_to_ccg_derivation(tree))
     dprint(drs)

     nominals = sentence.get_np_nominals()
     phrases = [span.text for _, span in nominals]
     for expected in ('Robbie', 'a hot-tempered tennis player'):
         self.assertTrue(expected in phrases)

     def key_of(phrase):
         # First element of the first nominal whose span text is `phrase`.
         hits = [pair for pair in nominals if phrase == pair[1].text]
         return hits[0][0]

     X = key_of('Robbie')
     Y = key_of('a hot-tempered tennis player')
     self.assertNotEqual(X, Y)
     self.assertTrue(drs.find_condition(Rel('_AKA', [X, Y])) is not None)
     self.assertTrue(repr(drs).count('_AKA') == 1)

Beispiel #11

0

Datei anzeigen

 def test4_ApposInterrupt(self):
     # Appositive followed by a relative clause. The two noun phrases must
     # have different keys, be linked by an _AKA relation, and '_AKA' must
     # occur exactly once in the DRS repr.
     text = r"Bell, a telecommunications company, which is located in Los Angeles, makes and distributes electronics, computers, and building products"
     cleaned = preprocess_sentence(text)
     ccg_text = grpc.ccg_parse(self.stub, cleaned, grpc.DEFAULT_SESSION)
     tree = parse_ccg_derivation(ccg_text)
     sentence = process_ccg_pt(tree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     drs = sentence.get_drs()
     dprint(pt_to_ccg_derivation(tree))
     dprint(drs)

     nominals = sentence.get_np_nominals()
     phrases = [span.text for _, span in nominals]
     for expected in ('Bell', 'a telecommunications company'):
         self.assertTrue(expected in phrases)

     def key_of(phrase):
         # First element of the first nominal whose span text is `phrase`.
         hits = [pair for pair in nominals if phrase == pair[1].text]
         return hits[0][0]

     X = key_of('Bell')
     Y = key_of('a telecommunications company')
     self.assertNotEqual(X, Y)
     self.assertTrue(drs.find_condition(Rel('_AKA', [X, Y])) is not None)
     self.assertTrue(repr(drs).count('_AKA') == 1)

Beispiel #12

0

Datei anzeigen

Datei: number_test.py Projekt: marbles-ai/ie

 def test1_Currency_00_0195(self):
     # Sentence containing a currency amount. Checks the preprocessed text
     # and the noun-phrase and verb-phrase nominals found in the parse.
     text = r"On the other hand, had it existed then, Cray Computer would have incurred a $20.5 million loss."
     etext = r"On the other hand , had it existed then , Cray Computer would have incurred a $ 20.5 million loss ."
     cleaned = preprocess_sentence(text)
     self.assertEqual(etext, cleaned)
     ccg_text = grpc.ccg_parse(self.stub, cleaned, grpc.DEFAULT_SESSION)
     tree = parse_ccg_derivation(ccg_text)
     sentence = process_ccg_pt(tree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     drs = sentence.get_drs()
     dprint(pt_to_ccg_derivation(tree))
     dprint(drs)

     fnps = sentence.get_np_nominals()
     nps = [span.text for _, span in fnps]
     for expected in ('the other hand', 'Cray-Computer', '$ 20.5 million'):
         self.assertTrue(expected in nps)

     fvps = sentence.get_vp_nominals()
     vps = [span.text for _, span in fvps]
     for expected in ('had', 'existed', 'would have incurred'):
         self.assertTrue(expected in vps)

Beispiel #13

0

Datei anzeigen

 def test2_ApposInterrupt(self):
     # Appositive containing a possessive. The name and the appositive noun
     # phrase must have different keys, be linked by an _AKA relation, and
     # '_AKA' must occur exactly once in the DRS repr.
     text = r"Reliable, Diane's eleven-year-old beagle, chews holes in the living room carpeting as if he were still a puppy."
     cleaned = preprocess_sentence(text)
     ccg_text = grpc.ccg_parse(self.stub, cleaned, grpc.DEFAULT_SESSION)
     tree = parse_ccg_derivation(ccg_text)
     sentence = process_ccg_pt(tree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     drs = sentence.get_drs()
     dprint(pt_to_ccg_derivation(tree))
     dprint(drs)

     nominals = sentence.get_np_nominals()
     phrases = [span.text for _, span in nominals]
     for expected in ('Reliable', "eleven-year-old beagle", "Diane"):
         self.assertTrue(expected in phrases)

     def key_of(phrase):
         # First element of the first nominal whose span text is `phrase`.
         hits = [pair for pair in nominals if phrase == pair[1].text]
         return hits[0][0]

     X = key_of('Reliable')
     Y = key_of("eleven-year-old beagle")
     self.assertNotEqual(X, Y)
     self.assertTrue(drs.find_condition(Rel('_AKA', [X, Y])) is not None)
     self.assertTrue(repr(drs).count('_AKA') == 1)

Beispiel #14

0

Datei anzeigen

 def test10_Ccgbank_00_0036(self):
     # Sentence with possessive apostrophes. Checks the preprocessed text and
     # the noun-phrase and verb-phrase nominals found in the parse.
     text = "Average maturity of the funds' investments lengthened by a day to 41 days, the longest since early August, according to Donoghue's."
     etext = "Average maturity of the funds ' investments lengthened by a day to 41 days , the longest since early August , according to Donoghue 's ."
     cleaned = preprocess_sentence(text)
     self.assertEqual(etext, cleaned)
     ccg_text = grpc.ccg_parse(self.stub, cleaned, grpc.DEFAULT_SESSION)
     tree = parse_ccg_derivation(ccg_text)
     sentence = process_ccg_pt(tree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     drs = sentence.get_drs()
     dprint(pt_to_ccg_derivation(tree))
     dprint(drs)

     fnps = sentence.get_np_nominals()
     nps = [span.text for _, span in fnps]
     # 'Average maturity' is deliberately not checked here (the assertion
     # for it was disabled).
     for expected in ('the funds', 'a day', '41 days', 'the longest',
                      'early August'):
         self.assertTrue(expected in nps)

     fvps = sentence.get_vp_nominals()
     vps = [span.text for _, span in fvps]
     for expected in ('lengthened', 'according'):
         self.assertTrue(expected in vps)

Beispiel #15

0

Datei anzeigen

Datei: conj_test.py Projekt: marbles-ai/ie

 def test04_AndOfVerb(self):
     # Two "and"-joined verbs sharing one subject and one object. Checks the
     # selected phrases and the event/argument relations in the DRS.
     text = "Bell makes and distributes computers"
     cleaned = preprocess_sentence(text)
     ccg_text = grpc.ccg_parse(self.stub, cleaned, grpc.DEFAULT_SESSION)
     tree = parse_ccg_derivation(ccg_text)
     sentence = process_ccg_pt(tree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     drs = sentence.get_drs()
     dprint(pt_to_ccg_derivation(tree))
     dprint(drs)

     found = sentence.select_phrases(RT_PROPERNAME | RT_ENTITY | RT_EVENT)
     phrases = [span.text for _, span in found.iteritems()]
     for expected in ('Bell', 'makes distributes', 'computers'):
         self.assertTrue(expected in phrases)

     def key_of(phrase):
         # Key paired with the first selected span whose text is `phrase`.
         hits = [entry for entry in found.iteritems()
                 if phrase == entry[1].text]
         return hits[0][0]

     E1 = key_of('makes distributes')
     X1 = key_of('Bell')
     X2 = key_of('computers')

     for name, refs in (('_EVENT', [E1]),
                        ('_ARG0', [E1, X1]),
                        ('_ARG1', [E1, X2])):
         self.assertTrue(drs.find_condition(Rel(name, refs)) is not None)

Beispiel #16

0

Datei anzeigen

Datei: infox.py Projekt: marbles-ai/ie

    def parse(self, request, context):
        """Parse a message.

        Runs ``request.text`` through sentence preprocessing, the CCG parser
        service and ``process_ccg_pt`` (using ``request.options``), then
        copies the resulting lexemes and constituents into a ``GSentence``
        response.

        A ``requests`` connection error is treated as transient: the call
        waits and tries again, for up to three attempts in total. Any other
        exception marks the RPC as ABORTED and is re-raised immediately.

        Raises:
            RuntimeError: if the application is terminating, or if every
                attempt ended in a connection error.
        """
        retry = 3
        while retry:
            if self.state.terminate:
                context.set_code(grpc.StatusCode.CANCELLED)
                context.set_details('Application terminating.')
                raise RuntimeError('Application terminating!')

            try:
                # EasyXXX does not handle these
                smod = preprocess_sentence(request.text)
                ccgbank = gsvc.ccg_parse(self.ccg_stub, smod,
                                         gsvc.DEFAULT_SESSION)
                pt = parse_ccg_derivation(ccgbank)
                ccg = process_ccg_pt(pt, options=request.options)
                sent = ccg.get_verbnet_sentence()

                response = infox_service_pb2.GSentence()
                for lex in sent:
                    glex = response.lexemes.add()
                    glex.head = lex.head
                    glex.idx = lex.idx
                    glex.mask = lex.mask
                    for r in lex.refs:
                        glex.refs.append(r.var.to_string())
                    glex.pos = lex.pos.tag
                    glex.word = lex.word
                    glex.stem = lex.stem
                    glex.category = lex.category.signature
                    if lex.wiki_data is not None:
                        glex.wikidata.title = lex.wiki_data.title
                        glex.wikidata.summary = lex.wiki_data.summary
                        glex.wikidata.page_categories.extend(
                            lex.wiki_data.page_categories)
                        glex.wikidata.url = lex.wiki_data.url

                for c in ccg.constituents:
                    gc = response.constituents.add()
                    gc.span.extend(c.span.get_indexes())
                    gc.vntype = c.vntype.signature
                    gc.head = c.chead

                return response

            except requests.exceptions.ConnectionError as e:
                # Transient failure: log it, pause, and go round the loop
                # again. This handler used to re-raise unconditionally, which
                # made the retry counter and the 'Too many retries!' path
                # below unreachable.
                self.state.wait(0.25)
                retry -= 1
                self.logger.exception('Infox.parse', exc_info=e)

            except Exception as e:
                # Anything else is not retried.
                self.logger.exception('Infox.parse', exc_info=e)
                context.set_code(grpc.StatusCode.ABORTED)
                # Prefer the exception's own message when it has a non-empty
                # one; fall back to str(e) because `.message` is empty or
                # absent on many exceptions.
                context.set_details(getattr(e, 'message', None) or str(e))
                raise

        context.set_code(grpc.StatusCode.ABORTED)
        context.set_details('Too many retries!')
        raise RuntimeError('Too many retries!')

Beispiel #17

0

Datei anzeigen

Datei: __init__.py Projekt: marbles-ai/ie

    def run(self):
        """Process messages.

        For each message received from the news queue: split the story body
        into paragraphs and sentences, CCG-parse the title and every
        sentence, delete the message from the news queue and, when a CCG
        queue is configured, forward the JSON result to it. Trailing
        paragraphs are dropped if the serialised result reaches 200 KiB.

        Connection errors while parsing are retried up to three times; any
        other parse error abandons the message without deleting it.
        Exceptions are re-raised only when ``self.state.pass_on_exceptions``
        is set. The loop stops early when ``self.state.terminate`` is set.
        """
        global _logger
        max_body = 200 * 1024
        for message in receive_messages(self.aws.news_queue, MessageAttributeNames=['All']):
            # Attributes will be passed onto next queue
            attributes = message.message_attributes
            mhash = attributes['hash']['StringValue']
            _logger.debug('Received news_queue(%s) -> hash(%s)', message.message_id, mhash)
            body = json.loads(message.body)
            retry = 3
            title = body['title']
            # Real lists (not filter/map results) so len() and re-iteration
            # work whether or not filter() returns a list.
            stripped = [line.strip() for line in body['content'].split('\n')]
            paragraphs_in = [p for p in stripped if p]
            if not paragraphs_in:
                # Separate arguments, one per %s: passing a single tuple left
                # the second placeholder unfilled, so the record could not be
                # formatted.
                _logger.debug('No paragraphs for story %s\n%s', mhash, title)
            # Use NLTK to split paragraphs into sentences.
            paragraphs_out = [
                [s for s in sent_tokenize(p) if s.strip()]
                for p in paragraphs_in
            ]

            if self.state.terminate:
                break

            result = {}
            result['title'] = {}
            while retry:
                try:
                    # NOTE(review): the title is parsed without
                    # preprocess_sentence(), unlike the body sentences below -
                    # confirm this is intended.
                    ccgbank = grpc.ccg_parse(self.aws.stub, title, grpc.DEFAULT_SESSION)
                    pt = parse_ccg_derivation(ccgbank)
                    ccg = process_ccg_pt(pt, options=self.options)
                    result['title']['lexemes'] = [x.get_json() for x in ccg.get_span()]
                    result['title']['constituents'] = [c.get_json() for c in ccg.constituents]
                    ccgpara = []
                    result['paragraphs'] = ccgpara
                    for sentences in paragraphs_out:
                        ccgsent = []
                        ccgpara.append(ccgsent)
                        for s in sentences:
                            smod = preprocess_sentence(s)
                            ccgbank = grpc.ccg_parse(self.aws.stub, smod, grpc.DEFAULT_SESSION)
                            pt = parse_ccg_derivation(ccgbank)
                            ccg = process_ccg_pt(pt, options=self.options)
                            ccgentry = {}
                            ccgentry['lexemes'] = [x.get_json() for x in ccg.get_span()]
                            ccgentry['constituents'] = [c.get_json() for c in ccg.constituents]
                            ccgsent.append(ccgentry)
                    break   # exit while
                except requests.exceptions.ConnectionError as e:
                    time.sleep(0.25)
                    retry -= 1
                    _logger.exception('AwsNewsQueueReader.run', exc_info=e)
                    if self.state.pass_on_exceptions:
                        raise
                except Exception as e:
                    # After X reads AWS sends the item to the dead letter queue.
                    # X is configurable in AWS console.
                    retry = 0
                    _logger.exception('AwsNewsQueueReader.run', exc_info=e, rlimitby=mhash)
                    if self.state.pass_on_exceptions:
                        raise

                if self.state.terminate:
                    retry = 0
                    break

            # retry == 0 indicates failure
            if retry == 0:
                continue

            try:
                # Let the queue know that the message is processed
                message.delete()
                if self.aws.ccg_queue:
                    ireduce = -1
                    iorig = len(result['paragraphs'])

                    while True:
                        # Add indent so easier to debug
                        data = json.dumps(result, indent=2)
                        if len(data) < max_body:
                            break

                        # Too big: keep a leading share of the paragraphs
                        # proportional to the overshoot, always dropping at
                        # least one. Floor division keeps the slice index an
                        # integer.
                        para = result['paragraphs']
                        ireduce = max([1, (len(para) * max_body) // len(data)])
                        ireduce = min([len(para) - 1, ireduce])
                        result['paragraphs'] = para[0:ireduce]

                        if len(result['paragraphs']) <= 1:
                            # Nothing more can be dropped. Serialise again so
                            # the trimmed story is what gets sent; previously
                            # `data` still held the untrimmed text here. The
                            # result may still be over the limit.
                            data = json.dumps(result, indent=2)
                            break

                    if ireduce >= 0:
                        _logger.warning('Hash(%s) ccg paragraphs reduced from %d to %d' % (mhash, iorig, ireduce))
                    response = self.aws.ccg_queue.send_message(MessageAttributes=attributes, MessageBody=data)
                    _logger.debug('Sent hash(%s) -> ccg_queue(%s)', mhash, response['MessageId'])
            except Exception as e:
                _logger.exception('AwsNewsQueueReader.run', exc_info=e, rlimitby=mhash)
                if self.state.pass_on_exceptions:
                    raise

Beispiel #18

0

Datei anzeigen

Datei: process_text.py Projekt: marbles-ai/ie

    try:
        sessionId = grpc.DEFAULT_SESSION
        if options.ofmt is not None:
            if options.ofmt not in [
                    'ccgbank', 'html', 'logic', 'extended', 'drs'
            ]:
                die('bad output format %s, must be ccgbank|html|logic|extended'
                    % options.ofmt)
            # Create a session to match output format, default is CCGBANK
            if options.ofmt != 'ccgbank' and options.ofmt != 'drs':
                sessionId = grpc.create_session(stub, options.ofmt)

        titleSrch = re.compile(titleRe)
        if not options.book:
            line = preprocess_sentence(' '.join(args))
            html = None
            # FIXME: Convert to python 3. Unicode is default.
            ccg = grpc.ccg_parse(stub, line, sessionId)
            if options.ofmt == 'html':
                html = ccg
                ccg = None
            drs = None
            pccg = None
            fol = None
            constituents = None
            orphaned = None
            conjoins = None
            functor_phrases = None
            vnconstituents = ''
            constituents = ''

Beispiel #19

0

Datei anzeigen

 def test10_Ccgbank_00_0099(self):
     # Long sentence with conjoined gerunds, possessives and multi-word
     # proper names. Checks the noun-phrase and verb-phrase nominals and the
     # argument/possessive relations in the DRS.
     text = "Plans that give advertisers discounts for maintaining or increasing ad spending have become permanent fixtures at the news weeklies and underscore the fierce competition between Newsweek, Time Warner Inc.'s Time magazine, and Mortimer B. Zuckerman's U.S. News & World Report."
     cleaned = preprocess_sentence(text)
     ccg_text = grpc.ccg_parse(self.stub, cleaned, grpc.DEFAULT_SESSION)
     tree = parse_ccg_derivation(ccg_text)
     sentence = process_ccg_pt(tree, CO_NO_VERBNET | CO_NO_WIKI_SEARCH)
     drs = sentence.get_drs()
     dprint(pt_to_ccg_derivation(tree))
     dprint(drs)

     f = sentence.get_np_nominals()
     phrases = [span.text for _, span in f]
     for expected in ('Plans', 'advertisers', 'discounts', 'ad spending',
                      'permanent fixtures', 'the news weeklies',
                      'the fierce competition', "Newsweek",
                      "Time-Warner-Inc.", "Time-magazine",
                      "Mortimer-B.-Zuckerman", "U.S.-News-&-World-Report"):
         self.assertTrue(expected in phrases)

     vf = sentence.get_vp_nominals()
     vphrases = [span.text for _, span in vf]
     for expected in ('give', 'maintaining increasing', 'have become',
                      'underscore'):
         self.assertTrue(expected in vphrases)

     def key_in(pairs, phrase):
         # First element of the first pair in `pairs` whose span text is
         # `phrase`.
         hits = [pair for pair in pairs if phrase == pair[1].text]
         return hits[0][0]

     give = key_in(vf, 'give')
     become = key_in(vf, 'have become')
     # uscore and weeklies are looked up but not asserted on below.
     uscore = key_in(vf, 'underscore')
     minc = key_in(vf, 'maintaining increasing')
     plans = key_in(f, 'Plans')
     advertisers = key_in(f, 'advertisers')
     discounts = key_in(f, 'discounts')
     spending = key_in(f, 'ad spending')
     fixtures = key_in(f, 'permanent fixtures')
     weeklies = key_in(f, 'the news weeklies')
     timeinc = key_in(f, 'Time-Warner-Inc.')
     timemag = key_in(f, 'Time-magazine')
     mortimer = key_in(f, 'Mortimer-B.-Zuckerman')
     uswr = key_in(f, 'U.S.-News-&-World-Report')

     for name, refs in (('_ARG0', [give, plans]),
                        ('_ARG1', [give, advertisers]),
                        ('_ARG2', [give, discounts]),
                        ('_ARG0', [minc, plans]),
                        ('_ARG1', [minc, spending]),
                        ('_ARG0', [become, plans]),
                        ('_ARG1', [become, fixtures]),
                        ('_POSS', [mortimer, uswr]),
                        ('_POSS', [timeinc, timemag])):
         self.assertTrue(drs.find_condition(Rel(name, refs)) is not None)