Example #1
    def decode(cls):
        parser = argparse.ArgumentParser(
            description='Use a pos tagger to decode raw text')
        parser.add_argument('--model_path',
                            type=str,
                            default=ELIT_POS_FLAIR_EN_MIXED,
                            help='file path to the saved model')
        args = None
        try:
            args = parser.parse_args(sys.argv[3:])
        except SystemExit:
            parser.print_help()
            exit(1)
        tagger = POSFlairTagger()
        tagger.load(args.model_path)
        components = [EnglishTokenizer(), tagger]
        for line in sys.stdin:
            line = line.strip()
            docs = line
            for c in components:
                docs = c.decode(docs)
            for d in docs:  # type: Document
                for sent in d:  # type: Sentence
                    print(' '.join('{}/{}'.format(word, pos)
                                   for word, pos in zip(
                                       sent.tokens, sent.part_of_speech_tags)))
Example #2
    def decode(cls):
        parser = argparse.ArgumentParser(
            description='Use a semantic dependency parser to decode raw text')
        parser.add_argument('--model_path',
                            type=str,
                            default=ELIT_SDP_BIAFFINE_EN_MIXED,
                            help='file path to the saved model')
        args = None
        try:
            args = parser.parse_args(sys.argv[3:])
        except SystemExit:
            parser.print_help()
            exit(1)

        this_module = SDPParser()
        this_module.load(args.model_path)
        pos_tagger = POSFlairTagger()
        pos_tagger.load(ELIT_POS_FLAIR_EN_MIXED)
        components = [EnglishTokenizer(), pos_tagger, this_module]
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            docs = line
            for c in components:
                docs = c.decode(docs)
            for d in docs:  # type: Document
                for sent in d.to_conll():
                    print(sent)
Example #3
def convert_raw_mention_to_lemma_elit_batch_version(
        raw_mentions: list) -> list:
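    # Batch variant: lazily build and cache the ELIT pipeline (tokenizer ->
    # POS tagger -> lemmatizer) in a module-level global, then lemmatize every
    # raw mention string in raw_mentions in one pass.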
    global ELIT_LEMA_COMPONENTS
    if not ELIT_LEMA_COMPONENTS:
        tokenizer = EnglishTokenizer()
        pos_tagger = POSFlairTagger()
        pos_tagger.load()
        lemmatizer = EnglishLemmatizer()
        ELIT_LEMA_COMPONENTS = [tokenizer, pos_tagger, lemmatizer]
        print('ELIT Lemmatization Components Load Done')
    components = ELIT_LEMA_COMPONENTS
    docs = [m.replace("’s", "'s").replace('—', '-') for m in raw_mentions]
    for c in components[:-1]:
        docs = c.decode(docs)
    #print(docs)
    # deal with special cases: one `JJ` token as the raw_mention should be `VBN` event trigger
    for doc in docs:
        for sent in doc['sens']:
            if len(sent['tok']) == 1 and sent['pos'][0] == 'JJ':
                sent['pos'][0] = 'VBN'
    docs = components[-1].decode(docs)
    #print(docs)
    lemmas_list = []
    for doc in docs:
        for sent in doc['sens']:
            lemmas = convert_processed_mention_to_lemma_elit_version(
                sent['tok'], sent['pos'], sent['lem'])
            lemmas_list.append(lemmas)
    return lemmas_list
Example #4
def convert_raw_mention_to_lemma_elit_version(raw_mention: str) -> list:
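    # Single-mention variant of the batch function above: reuse the cached ELIT
    # tokenizer / POS tagger / lemmatizer pipeline on one raw mention string.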
    global ELIT_LEMA_COMPONENTS
    if not ELIT_LEMA_COMPONENTS:
        tokenizer = EnglishTokenizer()
        pos_tagger = POSFlairTagger()
        pos_tagger.load()
        lemmatizer = EnglishLemmatizer()
        ELIT_LEMA_COMPONENTS = [tokenizer, pos_tagger, lemmatizer]
        print('ELIT Lemmatization Components Load Done')
    components = ELIT_LEMA_COMPONENTS
    raw_mention = raw_mention.replace("’s", "'s").replace('—', '-')
    doc = raw_mention
    for c in components[:-1]:
        doc = c.decode(doc)

    lemmas = []
    # deal with special cases: one `JJ` token as the raw_mention should be `VBN` event trigger
    for sent in doc[0]['sens']:
        if len(sent['tok']) == 1 and sent['pos'][0] == 'JJ':
            sent['pos'][0] = 'VBN'
    doc = components[-1].decode(doc)
    elit_sent_res = doc[0]['sens'][0]
    lemmas_post = convert_processed_mention_to_lemma_elit_version(
        elit_sent_res['tok'], elit_sent_res['pos'], elit_sent_res['lem'])
    return lemmas_post
Example #5
def produce_elit_dep_trees(input_file, output_file):
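    # Run the ELIT pipeline (tokenizer -> POS tagger -> biaffine dependency
    # parser) over every line of input_file and dump all parses to output_file
    # as a single JSON array.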
    parser = DEPBiaffineParser()
    parser.load()
    pos_tagger = POSFlairTagger()
    pos_tagger.load()
    print('ELIT components load DONE')
    components = [EnglishTokenizer(), pos_tagger, parser]
    result_to_disk = []
    with open(input_file, encoding='utf-8') as fopen:
        for line in tqdm.tqdm(fopen):
            docs = [line.strip()]
            for c in components:
                docs = c.decode(docs)
            # docs[]['sens'][]['dep'] contains np.int64
            for doc in docs:
                for sent_dict in doc['sens']:
                    sent_dict['dep'] = [(int(_[0]), _[1])
                                        for _ in sent_dict['dep']]
            result_to_disk.extend(docs)

    with open(output_file, 'w', encoding='utf-8') as fwrite:
        json.dump(result_to_disk, fwrite)
Example #6
def produce_elit_dep_trees_for_nyt_corpus_abstract(corpus_dir, output_dir):
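    # For each NYT XML file, extract and clean the abstract paragraph, parse it
    # with ELIT, and write one JSON line per document to
    # <output_dir>/<year>.jsonlines.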
    parser = DEPBiaffineParser()
    parser.load()
    pos_tagger = POSFlairTagger()
    pos_tagger.load()
    lemmatizer = EnglishLemmatizer()
    components = [EnglishTokenizer(), pos_tagger, lemmatizer, parser]
    print('ELIT components load DONE')

    for dir_year in os.listdir(corpus_dir):
        target_path = '%s/%s.jsonlines' % (output_dir, dir_year)
        print('Now dumping result to %s...' % (target_path))
        fwrite = open(target_path, 'w', encoding='utf-8')
        for file_doc in tqdm.tqdm(
                os.listdir(os.path.join(corpus_dir, dir_year))):
            # parse xml file
            cur_path = os.path.join(corpus_dir, dir_year, file_doc)
            xml_tree = ET.parse(cur_path)
            root = xml_tree.getroot()
            docid = file_doc.split('.')[0].zfill(7)
            abstract_dom = root.find('./body/body.head/abstract/p')
            abstract_cleaned = clean_nyt_abstract(abstract_dom.text)
            # elit to produce dep tree
            file_res = {'abstract': abstract_cleaned, 'doc_id': docid}
            if abstract_cleaned:
                docs = abstract_cleaned
                for c in components:
                    docs = c.decode(docs)
                for doc in docs:
                    doc['para_id'] = doc['doc_id']  # para_id=0 means headline
                    doc.pop('doc_id')
                    for sent_dict in doc['sens']:
                        sent_dict['dep'] = [(int(_[0]), _[1])
                                            for _ in sent_dict['dep']]
                file_res['elit_res'] = docs
            else:
                file_res['elit_res'] = []
            fwrite.write(json.dumps(file_res) + '\n')
        fwrite.close()
Example #7
    def decode(cls):
        parser = argparse.ArgumentParser(
            description='Use a NER tagger to decode raw text')
        parser.add_argument('--model_path',
                            type=str,
                            default=ELIT_NER_FLAIR_EN_ONTONOTES,
                            help='file path to the saved model')
        args = None
        try:
            args = parser.parse_args(sys.argv[3:])
        except SystemExit:
            parser.print_help()
            exit(1)
        tagger = NERFlairTagger()
        tagger.load(args.model_path)
        components = [EnglishTokenizer(), tagger]
        for line in sys.stdin:
            line = line.strip()
            docs = line
            for c in components:
                docs = c.decode(docs)
            for d in docs:  # type: Document
                for sent in d:  # type: Sentence
                    print(sent[NER])
Example #8
def english_tokenizer():
    return EnglishTokenizer()
Example #9
# from elit.component import EnglishTokenizer
from elit.component.tokenizer import EnglishTokenizer
# from elit.component import POSFlairTagger
from elit.component import POSTagger

texts = [
    'Emory University is a private research university in Atlanta, in the U.S. state of Georgia. The university was founded as Emory College in 1836 in Oxford, Georgia, by the Methodist Episcopal Church and was named in honor of Methodist bishop John Emory.',
    'In 1915, Emory College moved to its present location in Druid Hills and was rechartered as Emory University. Emory maintained a presence in Oxford that eventually became Oxford College, a residential liberal arts college for the first two years of the Emory baccalaureate degree.[19] The university is the second-oldest private institution of higher education in Georgia and among the fifty oldest private universities in the United States.']

tok = EnglishTokenizer()
pos = POSTagger()
docs = [tok.decode(text) for text in texts]

components = [pos]

for component in components:
    docs = component.decode(docs)

print(docs)
Example #10
def produce_elit_dep_trees_for_SemanticScholar(input_path: str,
                                               output_path: str) -> None:
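    # For each Semantic Scholar record (one JSON object per line), parse the
    # title and abstract with ELIT and attach the dependency trees to the
    # record written to output_path.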
    print('[%s] Start process %s into %s' %
          (time.ctime(), input_path, output_path))
    parser = DEPBiaffineParser()
    parser.load()
    pos_tagger = POSFlairTagger()
    pos_tagger.load()
    lemmatizer = EnglishLemmatizer()
    components = [EnglishTokenizer(), pos_tagger, lemmatizer, parser]
    print('ELIT components load DONE')

    with open(input_path) as fopen, open(output_path, 'w') as fwrite:
        for line in tqdm.tqdm(fopen):
            line_dict = json.loads(line.strip())
            docno = line_dict['docno']
            paper_abstract = line_dict['paperAbstract']
            title = line_dict['title']
            file_res = {
                'doc_id': docno,
                'paper_abstract': paper_abstract,
                'title': title
            }
            # process title and abstract
            file_res['title_elit_res'] = []
            file_res['paper_abstract_elit_res'] = []
            # to save processing time
            if len(paper_abstract) > 0 and len(title) > 0:
                docs = [title, paper_abstract]
                for c in components:
                    docs = c.decode(docs)
                for doc in docs:
                    doc['para_id'] = doc['doc_id']  # para_id=0 means headline
                    doc.pop('doc_id')
                    for sent_dict in doc['sens']:
                        sent_dict['dep'] = [(int(_[0]), _[1])
                                            for _ in sent_dict['dep']]
                    if doc['para_id'] == 0:
                        file_res['title_elit_res'].append(doc)
                    else:
                        file_res['paper_abstract_elit_res'].append(doc)
            elif len(paper_abstract) > 0:
                docs = paper_abstract
                for c in components:
                    docs = c.decode(docs)
                for doc in docs:
                    doc['para_id'] = doc['doc_id']  # para_id=0 means headline
                    doc.pop('doc_id')
                    for sent_dict in doc['sens']:
                        sent_dict['dep'] = [(int(_[0]), _[1])
                                            for _ in sent_dict['dep']]
                file_res['paper_abstract_elit_res'] = docs
            elif len(title) > 0:
                docs = title
                for c in components:
                    docs = c.decode(docs)
                for doc in docs:
                    doc['para_id'] = doc['doc_id']  # para_id=0 means headline
                    doc.pop('doc_id')
                    for sent_dict in doc['sens']:
                        sent_dict['dep'] = [(int(_[0]), _[1])
                                            for _ in sent_dict['dep']]
                file_res['title_elit_res'] = docs

            fwrite.write(json.dumps(file_res) + '\n')
Example #11
def produce_elit_dep_trees_for_nyt_corpus(corpus_dir, output_dir):
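    # Parse the headline and full-text paragraphs of each NYT XML document for
    # the selected years; in the output, para_id 0 is the headline and the
    # remaining entries are body paragraphs.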
    parser = DEPBiaffineParser()
    parser.load()
    pos_tagger = POSFlairTagger()
    pos_tagger.load()
    lemmatizer = EnglishLemmatizer()
    components = [EnglishTokenizer(), pos_tagger, lemmatizer, parser]
    print('ELIT components load DONE')

    #for dir_year in os.listdir(corpus_dir):
    #for dir_year in ['2004', '2005', '2006', '2007']:
    #for dir_year in ['2000', '2001', '2002', '2003']:
    #for dir_year in ['1997', '1998', '1999']:
    for dir_year in ['2003']:
        target_path = '%s/%s.json' % (output_dir, dir_year)
        print('Now dumping result to %s...' % (target_path))
        fwrite = open(target_path, 'w', encoding='utf-8')
        for file_doc in tqdm.tqdm(
                os.listdir(os.path.join(corpus_dir, dir_year))):
            empty_flag = True
            # parse xml file
            cur_path = os.path.join(corpus_dir, dir_year, file_doc)
            xml_tree = ET.parse(cur_path)
            root = xml_tree.getroot()
            docid = file_doc.split('.')[0]
            headline_node = root.find('./body/body.head/hedline/hl1')
            if headline_node is not None:
                empty_flag = False
                headline = headline_node.text
            else:
                headline = ''
            content = []
            full_text_node = root.find(
                "./body/body.content/block[@class='full_text']")
            if full_text_node is not None:
                empty_flag = False
                for para in full_text_node.findall('p'):
                    content.append(para.text)
            # elit to produce dep tree
            file_res = {
                'headline': headline,
                'content': content,
                'doc_id': docid
            }
            if not empty_flag:
                docs = [headline] + content
                try:
                    for c in components:
                        docs = c.decode(docs)
                    for doc in docs:
                        doc['para_id'] = doc[
                            'doc_id']  # para_id=0 means headline
                        doc.pop('doc_id')
                        for sent_dict in doc['sens']:
                            sent_dict['dep'] = [(int(_[0]), _[1])
                                                for _ in sent_dict['dep']]
                    file_res['elit_res'] = docs
                except mxnet.base.MXNetError:
                    file_res['elit_res'] = []
                    print('%s encounter error when using ELIT parsing' %
                          (docid))
            else:
                file_res['elit_res'] = []
            fwrite.write(json.dumps(file_res) + '\n')
        fwrite.close()
Example #12
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-05-28 17:38
from elit.component import NERFlairTagger
from elit.component.tokenizer import EnglishTokenizer
from elit.structure import Document

tagger = NERFlairTagger()
tagger.load()
components = [EnglishTokenizer(), tagger]
docs = 'buy Apple TV'
for c in components:
    docs = c.decode(docs)
for d in docs:  # type: Document
    print(d)
Example #13
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-05-30 18:24
from elit.component import POSFlairTagger, SDPBiaffineParser
from elit.component.tokenizer import EnglishTokenizer

parser = SDPBiaffineParser()
parser.load()
pos_tagger = POSFlairTagger()
pos_tagger.load()
components = [EnglishTokenizer(), pos_tagger, parser]
docs = 'Is this the future of chamber music ?'
for c in components:
    docs = c.decode(docs)
for d in docs:  # type: Document
    for sent in d.to_conll():
        print(sent)
    print(d)