def decode(cls):
    parser = argparse.ArgumentParser(
        description='Use a pos tagger to decode raw text')
    parser.add_argument('--model_path', type=str, default=ELIT_POS_FLAIR_EN_MIXED,
                        help='file path to the saved model')
    args = None
    try:
        args = parser.parse_args(sys.argv[3:])
    except SystemExit:
        parser.print_help()
        exit(1)
    tagger = POSFlairTagger()
    tagger.load(args.model_path)
    components = [EnglishTokenizer(), tagger]
    for line in sys.stdin:
        line = line.strip()
        docs = line
        for c in components:
            docs = c.decode(docs)
        for d in docs:  # type: Document
            for sent in d:  # type: Sentence
                print(' '.join('{}/{}'.format(word, pos) for word, pos in
                               zip(sent.tokens, sent.part_of_speech_tags)))
def decode(cls):
    parser = argparse.ArgumentParser(
        description='Use a semantic dependency parser to decode raw text')
    parser.add_argument('--model_path', type=str, default=ELIT_SDP_BIAFFINE_EN_MIXED,
                        help='file path to the saved model')
    args = None
    try:
        args = parser.parse_args(sys.argv[3:])
    except SystemExit:
        parser.print_help()
        exit(1)
    this_module = SDPParser()
    this_module.load(args.model_path)
    pos_tagger = POSFlairTagger()
    pos_tagger.load(ELIT_POS_FLAIR_EN_MIXED)
    components = [EnglishTokenizer(), pos_tagger, this_module]
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        docs = line
        for c in components:
            docs = c.decode(docs)
        for d in docs:  # type: Document
            for sent in d.to_conll():
                print(sent)
def convert_raw_mention_to_lemma_elit_batch_version(raw_mentions: list) -> list:
    global ELIT_LEMA_COMPONENTS
    if len(ELIT_LEMA_COMPONENTS) <= 0:
        tokenizer = EnglishTokenizer()
        pos_tagger = POSFlairTagger()
        pos_tagger.load()
        lemmatizer = EnglishLemmatizer()
        ELIT_LEMA_COMPONENTS = [tokenizer, pos_tagger, lemmatizer]
        print('ELIT Lemmatization Components Load Done')
    components = ELIT_LEMA_COMPONENTS
    docs = [m.replace("’s", "'s").replace('—', '-') for m in raw_mentions]
    for c in components[:-1]:
        docs = c.decode(docs)
    # print(docs)
    # deal with special cases: one `JJ` token as the raw_mention should be `VBN` event trigger
    for doc in docs:
        for sent in doc['sens']:
            if len(sent['tok']) == 1 and sent['pos'][0] == 'JJ':
                sent['pos'][0] = 'VBN'
    docs = components[-1].decode(docs)
    # print(docs)
    lemmas_list = []
    for doc in docs:
        lemmas = []
        for _ in doc['sens']:
            lemmas = convert_processed_mention_to_lemma_elit_version(
                _['tok'], _['pos'], _['lem'])
        lemmas_list.append(lemmas)
    return lemmas_list
def convert_raw_mention_to_lemma_elit_version(raw_mention: str) -> list:
    global ELIT_LEMA_COMPONENTS
    if len(ELIT_LEMA_COMPONENTS) <= 0:
        tokenizer = EnglishTokenizer()
        pos_tagger = POSFlairTagger()
        pos_tagger.load()
        lemmatizer = EnglishLemmatizer()
        ELIT_LEMA_COMPONENTS = [tokenizer, pos_tagger, lemmatizer]
        print('ELIT Lemmatization Components Load Done')
    components = ELIT_LEMA_COMPONENTS
    raw_mention = raw_mention.replace("’s", "'s").replace('—', '-')
    doc = raw_mention
    for c in components[:-1]:
        doc = c.decode(doc)
    # deal with special cases: one `JJ` token as the raw_mention should be `VBN` event trigger
    for _ in doc[0]['sens']:
        if len(_['tok']) == 1 and _['pos'][0] == 'JJ':
            _['pos'][0] = 'VBN'
    doc = components[-1].decode(doc)
    elit_sent_res = doc[0]['sens'][0]
    lemmas_post = convert_processed_mention_to_lemma_elit_version(
        elit_sent_res['tok'], elit_sent_res['pos'], elit_sent_res['lem'])
    return lemmas_post
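# A minimal usage sketch for the two lemmatization helpers above; it is not part of the
# original module. It assumes the module-level cache `ELIT_LEMA_COMPONENTS = []` and the
# helper `convert_processed_mention_to_lemma_elit_version` are defined elsewhere in this
# file, and that the ELIT POS tagger and lemmatizer models can be loaded on this machine.
if __name__ == '__main__':
    # single mention -> list of lemmas
    print(convert_raw_mention_to_lemma_elit_version('barking dogs'))
    # batch of mentions -> list of lemma lists, one entry per input mention
    print(convert_raw_mention_to_lemma_elit_batch_version(['barking dogs', 'traded stocks']))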
def produce_elit_dep_trees(input_file, output_file):
    parser = DEPBiaffineParser()
    parser.load()
    pos_tagger = POSFlairTagger()
    pos_tagger.load()
    print('ELIT components load DONE')
    components = [EnglishTokenizer(), pos_tagger, parser]
    result_to_disk = []
    with open(input_file, encoding='utf-8') as fopen:
        for line in tqdm.tqdm(fopen):
            docs = [line.strip()]
            for c in components:
                docs = c.decode(docs)
            # docs[i]['sens'][j]['dep'] contains np.int64 head indices, which json cannot
            # serialize; cast them to plain int
            for doc in docs:
                for sent_dict in doc['sens']:
                    sent_dict['dep'] = [(int(_[0]), _[1]) for _ in sent_dict['dep']]
            result_to_disk.extend(docs)
    with open(output_file, 'w', encoding='utf-8') as fwrite:
        json.dump(result_to_disk, fwrite)
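# Hypothetical invocation of produce_elit_dep_trees; the file names below are
# placeholders, not from the original project. The input file is read line by line, each
# line being one raw text unit, and the output file receives a single JSON array of ELIT
# document dicts whose 'dep' heads have been cast to plain ints.
if __name__ == '__main__':
    produce_elit_dep_trees('raw_text.txt', 'elit_dep_trees.json')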
def produce_elit_dep_trees_for_nyt_corpus_abstract(corpus_dir, output_dir):
    parser = DEPBiaffineParser()
    parser.load()
    pos_tagger = POSFlairTagger()
    pos_tagger.load()
    lemmatizer = EnglishLemmatizer()
    components = [EnglishTokenizer(), pos_tagger, lemmatizer, parser]
    print('ELIT components load DONE')
    for dir_year in os.listdir(corpus_dir):
        target_path = '%s/%s.jsonlines' % (output_dir, dir_year)
        print('Now dumping result to %s...' % target_path)
        fwrite = open(target_path, 'w', encoding='utf-8')
        for file_doc in tqdm.tqdm(os.listdir(os.path.join(corpus_dir, dir_year))):
            # parse xml file
            cur_path = os.path.join(corpus_dir, dir_year, file_doc)
            xml_tree = ET.parse(cur_path)
            root = xml_tree.getroot()
            docid = file_doc.split('.')[0].zfill(7)
            abstract_dom = root.find('./body/body.head/abstract/p')
            abstract_cleaned = clean_nyt_abstract(abstract_dom.text)
            # elit to produce dep tree
            file_res = {'abstract': abstract_cleaned, 'doc_id': docid}
            if len(abstract_cleaned) > 0:
                docs = abstract_cleaned
                for c in components:
                    docs = c.decode(docs)
                for doc in docs:
                    doc['para_id'] = doc['doc_id']  # para_id=0 means headline
                    doc.pop('doc_id')
                    for sent_dict in doc['sens']:
                        sent_dict['dep'] = [(int(_[0]), _[1]) for _ in sent_dict['dep']]
                file_res['elit_res'] = docs
            else:
                file_res['elit_res'] = []
            fwrite.write(json.dumps(file_res) + '\n')
        fwrite.close()
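# A hedged example call for produce_elit_dep_trees_for_nyt_corpus_abstract; the paths are
# placeholders, not from the original project. The layout inferred from the code above is
# corpus_dir/<year>/<docid>.xml (NYT Annotated Corpus articles), and the output is
# output_dir/<year>.jsonlines with one JSON object per article.
if __name__ == '__main__':
    produce_elit_dep_trees_for_nyt_corpus_abstract('/path/to/nyt_corpus', '/path/to/output')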
def decode(cls):
    parser = argparse.ArgumentParser(
        description='Use a NER tagger to decode raw text')
    parser.add_argument('--model_path', type=str, default=ELIT_NER_FLAIR_EN_ONTONOTES,
                        help='file path to the saved model')
    args = None
    try:
        args = parser.parse_args(sys.argv[3:])
    except SystemExit:
        parser.print_help()
        exit(1)
    tagger = NERFlairTagger()
    tagger.load(args.model_path)
    components = [EnglishTokenizer(), tagger]
    for line in sys.stdin:
        line = line.strip()
        docs = line
        for c in components:
            docs = c.decode(docs)
        for d in docs:  # type: Document
            for sent in d:  # type: Sentence
                print(sent[NER])
def english_tokenizer():
    return EnglishTokenizer()
# from elit.component import EnglishTokenizer
from elit.component.tokenizer import EnglishTokenizer
# from elit.component import POSFlairTagger
from elit.component import POSTagger

texts = [
    'Emory University is a private research university in Atlanta, in the U.S. state of Georgia. The university was founded as Emory College in 1836 in Oxford, Georgia, by the Methodist Episcopal Church and was named in honor of Methodist bishop John Emory.',
    'In 1915, Emory College moved to its present location in Druid Hills and was rechartered as Emory University. Emory maintained a presence in Oxford that eventually became Oxford College, a residential liberal arts college for the first two years of the Emory baccalaureate degree. The university is the second-oldest private institution of higher education in Georgia and among the fifty oldest private universities in the United States.']

tok = EnglishTokenizer()
pos = POSTagger()
docs = [tok.decode(text) for text in texts]
components = [pos]
for component in components:
    docs = component.decode(docs)
print(docs)
def produce_elit_dep_trees_for_SemanticScholar(input_path: str, output_path: str) -> None:
    print('[%s] Start processing %s into %s' % (time.ctime(), input_path, output_path))
    parser = DEPBiaffineParser()
    parser.load()
    pos_tagger = POSFlairTagger()
    pos_tagger.load()
    lemmatizer = EnglishLemmatizer()
    components = [EnglishTokenizer(), pos_tagger, lemmatizer, parser]
    print('ELIT components load DONE')
    with open(input_path) as fopen, open(output_path, 'w') as fwrite:
        for line in tqdm.tqdm(fopen):
            line_dict = json.loads(line.strip())
            docno = line_dict['docno']
            paper_abstract = line_dict['paperAbstract']
            title = line_dict['title']
            file_res = {
                'doc_id': docno,
                'paper_abstract': paper_abstract,
                'title': title
            }
            # process title and abstract
            file_res['title_elit_res'] = []
            file_res['paper_abstract_elit_res'] = []
            # to save processing time, decode title and abstract in one batch when both exist
            if len(paper_abstract) > 0 and len(title) > 0:
                docs = [title, paper_abstract]
                for c in components:
                    docs = c.decode(docs)
                for doc in docs:
                    doc['para_id'] = doc['doc_id']  # para_id=0 means the title
                    doc.pop('doc_id')
                    for sent_dict in doc['sens']:
                        sent_dict['dep'] = [(int(_[0]), _[1]) for _ in sent_dict['dep']]
                    if doc['para_id'] == 0:
                        file_res['title_elit_res'].append(doc)
                    else:
                        file_res['paper_abstract_elit_res'].append(doc)
            elif len(paper_abstract) > 0:
                docs = paper_abstract
                for c in components:
                    docs = c.decode(docs)
                for doc in docs:
                    doc['para_id'] = doc['doc_id']
                    doc.pop('doc_id')
                    for sent_dict in doc['sens']:
                        sent_dict['dep'] = [(int(_[0]), _[1]) for _ in sent_dict['dep']]
                file_res['paper_abstract_elit_res'] = docs
            elif len(title) > 0:
                docs = title
                for c in components:
                    docs = c.decode(docs)
                for doc in docs:
                    doc['para_id'] = doc['doc_id']
                    doc.pop('doc_id')
                    for sent_dict in doc['sens']:
                        sent_dict['dep'] = [(int(_[0]), _[1]) for _ in sent_dict['dep']]
                file_res['title_elit_res'] = docs
            fwrite.write(json.dumps(file_res) + '\n')
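# A hedged smoke test for produce_elit_dep_trees_for_SemanticScholar; the file names and
# the sample record are illustrative placeholders only. The grounded part is the set of
# keys the function reads from every JSON line: 'docno', 'paperAbstract', and 'title'.
# It also relies on the module's existing `json` import.
if __name__ == '__main__':
    sample = {'docno': 'S2-0000001',
              'title': 'A sample title.',
              'paperAbstract': 'A sample abstract. It has two sentences.'}
    with open('sample_s2.jsonlines', 'w') as f:
        f.write(json.dumps(sample) + '\n')
    produce_elit_dep_trees_for_SemanticScholar('sample_s2.jsonlines', 'sample_s2_deps.jsonlines')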
def produce_elit_dep_trees_for_nyt_corpus(corpus_dir, output_dir):
    parser = DEPBiaffineParser()
    parser.load()
    pos_tagger = POSFlairTagger()
    pos_tagger.load()
    lemmatizer = EnglishLemmatizer()
    components = [EnglishTokenizer(), pos_tagger, lemmatizer, parser]
    print('ELIT components load DONE')
    # for dir_year in os.listdir(corpus_dir):
    # for dir_year in ['2004', '2005', '2006', '2007']:
    # for dir_year in ['2000', '2001', '2002', '2003']:
    # for dir_year in ['1997', '1998', '1999']:
    for dir_year in ['2003']:
        target_path = '%s/%s.json' % (output_dir, dir_year)
        print('Now dumping result to %s...' % target_path)
        fwrite = open(target_path, 'w', encoding='utf-8')
        for file_doc in tqdm.tqdm(os.listdir(os.path.join(corpus_dir, dir_year))):
            empty_flag = True
            # parse the XML file
            cur_path = os.path.join(corpus_dir, dir_year, file_doc)
            xml_tree = ET.parse(cur_path)
            root = xml_tree.getroot()
            docid = file_doc.split('.')[0]
            headline_node = root.find('./body/body.head/hedline/hl1')
            if headline_node is not None:
                empty_flag = False
                headline = headline_node.text
            else:
                headline = ''
            content = []
            full_text_node = root.find("./body/body.content/block[@class='full_text']")
            if full_text_node is not None:
                empty_flag = False
                for para in full_text_node.findall('p'):
                    content.append(para.text)
            # run ELIT to produce the dependency trees
            file_res = {
                'headline': headline,
                'content': content,
                'doc_id': docid
            }
            if not empty_flag:
                docs = [headline] + content
                try:
                    for c in components:
                        docs = c.decode(docs)
                    for doc in docs:
                        doc['para_id'] = doc['doc_id']  # para_id=0 means headline
                        doc.pop('doc_id')
                        for sent_dict in doc['sens']:
                            sent_dict['dep'] = [(int(_[0]), _[1]) for _ in sent_dict['dep']]
                    file_res['elit_res'] = docs
                except mxnet.base.MXNetError:
                    file_res['elit_res'] = []
                    print('%s encountered an error during ELIT parsing' % docid)
            else:
                file_res['elit_res'] = []
            fwrite.write(json.dumps(file_res) + '\n')
        fwrite.close()
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-05-28 17:38
from elit.component import NERFlairTagger
from elit.component.tokenizer import EnglishTokenizer
from elit.structure import Document

tagger = NERFlairTagger()
tagger.load()
components = [EnglishTokenizer(), tagger]
docs = 'buy Apple TV'
for c in components:
    docs = c.decode(docs)
for d in docs:  # type: Document
    print(d)
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-05-30 18:24
from elit.component import POSFlairTagger, SDPBiaffineParser
from elit.component.tokenizer import EnglishTokenizer

parser = SDPBiaffineParser()
parser.load()
pos_tagger = POSFlairTagger()
pos_tagger.load()
components = [EnglishTokenizer(), pos_tagger, parser]
docs = 'Is this the future of chamber music ?'
for c in components:
    docs = c.decode(docs)
for d in docs:  # type: Document
    for sent in d.to_conll():
        print(sent)
    print(d)