def test_online_coref(self):
    # Test text, sents, tokens input
    # utterances = [
    #     {'speaker_id': 1, 'text': 'I read an article today. It is about US politics.'},
    #     {'speaker_id': 2, 'tokens': [['What', 'does', 'it', 'say', 'about', 'US', 'politics', '?']]},  # Tokens
    #     {'speaker_id': 1, 'text': 'It talks about the US presidential election.'},
    #     {'speaker_id': 2, 'text': ['I am interested to hear.', 'Can you elaborate more?']},  # Sents
    #     {'speaker_id': 1, 'text': 'Sure! The presidential election is indeed interesting.'}
    # ]
    utterances = [{
        'speaker_id': 1,
        'tokens': [['I', 'read', 'an', 'article', 'today', '.']]
    }, {
        'speaker_id': 2,
        'tokens': [['Can', 'you', 'tell', 'me', 'what', 'it', 'is', 'about', '?']]
    }]
    context = None
    tokens_to_date = []
    for turn, uttr in enumerate(utterances):
        if 'tokens' in uttr:
            input_doc = Input(
                tokens=uttr['tokens'],
                speaker_ids=uttr['speaker_id'],
                coref_context=self.convert_output_to_context(context),
                models=['ocr'])
        else:
            input_doc = Input(
                text=uttr['text'],
                speaker_ids=uttr['speaker_id'],
                coref_context=self.convert_output_to_context(context),
                models=['ocr'])
        output_doc = en_services.online_coref.predict_sequentially(
            input_doc, check_sanitization=True)
        print(json.dumps(output_doc))
        context = output_doc['ocr']
        # Print cluster text
        tokens_to_date += flatten(input_doc.tokens)
        for cluster in output_doc['ocr']['clusters']:
            for i in range(len(cluster)):
                m1, m2 = tuple(cluster[i])
                cluster[i] = (m1, m2, ' '.join(tokens_to_date[m1:m2]))
        print(f'cluster text: {output_doc["ocr"]["clusters"]}')
        print()
def main():
    if len(sys.argv) == 1:
        sys.argv.append('--help')
    arg_parser = argparse.ArgumentParser(
        description='ELIT-{}'.format(__version__))
    task_parser = arg_parser.add_subparsers(dest='task',
                                            help='which task to perform?')
    parse_parser = task_parser.add_parser(
        name='parse', help='interactive parse per document')
    server_parser = task_parser.add_parser(
        name='serve',
        help='start http server',
        description='An HTTP server for ELIT')
    server_parser.add_argument('--port', type=int, default=8000)
    server_parser.add_argument('--workers', type=int, default=1,
                               help='number of workers')
    args = arg_parser.parse_args()
    if args.task == 'parse':
        from elit.server.en_parser import service_parser
        for line in sys.stdin:
            line = line.strip()
            doc = service_parser.parse([Input(text=line)])[0]
            print(doc)
    elif args.task == 'serve':
        from elit.server import server
        server.run(port=args.port, workers=args.workers)
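# Usage sketch for the CLI above (the `elit` command name below is an assumption
# for illustration only; invoke main() however this module is actually exposed):
#
#   $ echo "Emory NLP is a research lab in Atlanta, GA." | elit parse
#   $ elit serve --port 8000 --workers 2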
def test_sents_input(self):
    text = [
        "Emory NLP is a research lab in Atlanta, GA.",
        "It is founded by Jinho D. Choi in 2014.",
        "Dr. Choi is a professor at Emory University."
    ]
    doc = en_services.parser.parse([Input(text=text)])[0]
    print(doc)
def test_doc_coref_sents(self):
    text = [
        "Emory NLP is a research lab in Atlanta, GA.",
        "It is founded by Jinho D. Choi in 2014.",
        "Dr. Choi is a professor at Emory University."
    ]
    input_doc = Input(text=text, models=['dcr'])
    print(en_services.doc_coref.predict(input_doc))
def main():
    text = [
        "Emory NLP is a research lab in Atlanta, GA. "
        "It is founded by Jinho D. Choi in 2014. Dr. Choi is a professor at Emory University."
    ]
    input = Input(text=text)
    input.models = ['lem']
    docs = en_services.parser.parse([input])
    for doc in docs:
        print(doc)

    # See elit.client for coreference examples
    text = 'Pfizer said last week it may need the U.S. government to help it secure some components needed to ' \
           'make the vaccine. While the company halved its 2020 production target due to manufacturing issues, ' \
           'it said last week its manufacturing is running smoothly now. The government also has the option to ' \
           'acquire up to an additional 400 million doses of the vaccine.'
    input_doc = Input(text=text, models=['dcr'])
    doc = service_doc_coref.predict(input_doc)
    print(doc)
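# Entry-point guard (an assumption: add it only if this example script does not
# already define one elsewhere) so the file can be run directly with `python <file>`:
if __name__ == '__main__':
    main()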
def test_doc_coref_tokens(self):
    tokens = [[
        "Emory", "NLP", "is", "a", "research", "lab", "in", "Atlanta", ",",
        "GA", "."
    ], [
        "It", "is", "founded", "by", "Jinho", "D.", "Choi", "in", "2014", ".",
        "Dr.", "Choi", "is", "a", "professor", "at", "Emory", "University", "."
    ]]
    input_doc = Input(tokens=tokens, models=['dcr'])
    print(en_services.doc_coref.predict(input_doc))
def test_tokens_input(self):
    tokens = [[
        "Emory", "NLP", "is", "a", "research", "lab", "in", "Atlanta", ",",
        "GA", "."
    ], [
        "It", "is", "founded", "by", "Jinho", "D.", "Choi", "in", "2014", "."
    ], [
        "Dr.", "Choi", "is", "a", "professor", "at", "Emory", "University", "."
    ]]
    doc = en_services.parser.parse([Input(tokens=tokens)])[0]
    print(doc)
def test_doc_coref_concurrent(self):
    batch_size = 32
    inputs = [Input(text=self.get_sample_text(), models=['dcr'])] * batch_size
    start_time = time.time()
    docs = en_services.doc_coref.predict(inputs)
    end_time = time.time()
    print(
        f'Concurrent doc coref elapsed time for {batch_size} small documents: '
        f'{end_time - start_time:.2f}s'
    )
    assert len(docs) == len(inputs)
    print(docs[0])
    print(docs[-1])
def test_tokens_input(self):
    tokens = [
        "yes i do what 's your job".split(),
    ]
    doc = en_services.parser.parse([Input(tokens=tokens)])[0]
    print(doc)
async def parse(text: str):
    input = Input(text=text)
    output: Document = await runner.process_input(input)
    if not isinstance(output, Document):
        raise HandlingError("Internal Server Error", code=500)
    return output.to_dict()