Example #1
    def __init__(self):
        self.lac = LAC(mode='lac')
        self.lac.load_customization('data/custom.txt', sep=None)
        self.ddparser = DDParser(encoding_model='transformer')
        self.fine_info = FineGrainedInfo
        self.keyword = Keyword()
        self.jieba = jieba
        self.posseg = jieba.posseg
Example #2
    def __init__(self):
        LTP_DIR = "./ltp_data"
        self.lac = LAC(mode='lac')
        self.lac.load_customization('data/custom.txt', sep=None)
        self.ddparser = DDParser(encoding_model='transformer')
        self.fine_info = FineGrainedInfo
        self.keyword = Keyword()
        self.jieba = jieba
        self.posseg = jieba.posseg
        self.segmentor = Segmentor(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger(
            model_path=os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer(
            os.path.join(LTP_DIR, "ner.model"))
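For reference, the pyltp components loaded above are usually chained segment -> postag -> parse -> recognize. The sketch below is illustrative only; the `ltp_analyze` helper and the way it receives the instance of the class above are assumptions, not part of the example:

# Minimal, illustrative sketch of how the pyltp models loaded in __init__
# are typically used together; `ltp_analyze` is a hypothetical helper.
def ltp_analyze(extractor, sentence):
    words = list(extractor.segmentor.segment(sentence))             # word segmentation
    postags = list(extractor.postagger.postag(words))               # POS tagging
    arcs = extractor.parser.parse(words, postags)                   # dependency arcs
    netags = list(extractor.recognizer.recognize(words, postags))   # named entities
    return words, postags, [(arc.head, arc.relation) for arc in arcs], netags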
Example #3
from ddparser import DDParser
from main import build_conllx, cut_sent
import pandas as pd
import stanza
from stanza.utils.conll import CoNLL
import spacy


with open('sample_corpus.txt', 'r', encoding='utf-8') as corpus_file:
    sample = corpus_file.read()
sample_sents = cut_sent(sample)

'''Construct the sample corpus'''
# begin ddparser
ddp = DDParser(use_pos=True)
data = ddp.parse(sample_sents)
build_conllx(data, 'sample_ddparser.conllx')
print('DDParser has finished the parsing.')


# begin SpaCy
nlp = spacy.load('zh_core_web_sm')
file_spacy = open('sample_spacy.conllx', 'w', encoding='utf-8')
# file_spacy_gold = open('gold_spacy.conllx', 'r', encoding='utf-8')
for sent in sample_sents:
    file_spacy.write('\n\n')
    # file_spacy_gold.write('\n\n')
    for idx, token in enumerate(nlp(sent)):
        print(token.text)
        print(token.pos_)
        print(token.dep_)
        line = f'{idx+1}\t{token.text}\t{token.pos_}\t{token.dep_}\t{token.head}\t{token.head.i}'
        file_spacy.write(line + '\n')
file_spacy.close()
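The stanza and CoNLL imports above suggest the original script also produced a Stanza parse of the same sentences. A hedged sketch of that step follows; the download call, pipeline settings, output file name, and the use of CoNLL.doc2conll_text are assumptions based on the common Stanza API of that period, not taken from the example:

# begin Stanza (illustrative sketch, see the note above)
stanza.download('zh-hans')   # only needed on the first run
nlp_stanza = stanza.Pipeline('zh-hans', processors='tokenize,pos,lemma,depparse')
with open('sample_stanza.conllx', 'w', encoding='utf-8') as file_stanza:
    for sent in sample_sents:
        doc = nlp_stanza(sent)
        file_stanza.write(CoNLL.doc2conll_text(doc) + '\n')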
Example #4
import re

from ddparser import DDParser


class SVOParser:
    def __init__(self):
        self.parser = DDParser(use_pos=True)
        print('loaded model')

    '''Split the text into sentences; question marks, exclamation marks, periods, semicolons, colons and newlines act as delimiters'''

    def split_sents(self, content):
        return [sentence for sentence in re.split(r'[??!!。;;::\n\r]', content) if sentence]

    '''Dependency parsing --- build, for every word in the sentence, a dict of its dependent (child) nodes keyed by relation'''
    def build_parse_child_dict(self, words, postags, rel_id, relation):
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(rel_id)):
                if rel_id[arc_index] == index + 1:  # head indices in rel_id start from 1
                    # group the dependents of this word by their dependency relation
                    child_dict.setdefault(relation[arc_index], []).append(arc_index)
            child_dict_list.append(child_dict)
        heads = ['Root' if id == 0 else words[id - 1] for id in rel_id]  # look up the head word of each token
        for i in range(len(words)):
            # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [relation[i], words[i], i, postags[i], heads[i], rel_id[i]-1, postags[rel_id[i]-1]]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''Main parsing entry point'''
    def parser_main(self, sentence):
        res = self.parser.parse(sentence)[0]
        words = res["word"]
        postags = res["postag"]
        rel_id = res["head"]
        relation = res["deprel"]

        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, rel_id, relation)
        return words, postags, child_dict_list, format_parse_list

    """将所有的ATT进行合并"""
    def merge_ATT(self, words, postags, format_parse_list):
        words_ = words
        retain_nodes = set()
        ATTs = []
        ATT = []
        format_parse_list_ = []
        for parse in format_parse_list:
            dep = parse[0]
            if dep in ['ATT', 'ADV']:
                ATT += [parse[2], parse[5]]
            else:
                if ATT:
                    body = ''.join([words[i] for i in sorted(set(ATT))])
                    ATTs.append(body)
                    retain_nodes.add(sorted(set(ATT))[-1])
                    words_[sorted(set(ATT))[-1]] = body
                else:
                    retain_nodes.add(parse[2])
                ATT = []
        for indx, parse in enumerate(format_parse_list):
            if indx in retain_nodes:
                parse_ = [parse[0], words_[indx], indx, postags[indx], words_[parse[5]], parse[5], postags[parse[5]]]
                format_parse_list_.append(parse_)
        return words_, postags, format_parse_list_, retain_nodes

    """基于该结果,提取三元组"""
    def extract(self, words, postags, child_dict_list, arcs, retain_nodes):
        svos = []
        for index in range(len(postags)):
            if index not in retain_nodes:
                continue
            tmp = 1
            # if the semantic role labels are empty, fall back to the dependency parse
            if postags[index]:
                # extract fact triples centered on the predicate
                child_dict = child_dict_list[index]
                # subject-verb-object (SVO)
                if 'SBV' in child_dict and 'VOB' in child_dict:
                    # e1s = self.expand_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    # e2s = self.expand_e(words, postags, child_dict_list, child_dict['VOB'][0])
                    r = words[index]
                    e1 = words[child_dict['SBV'][0]]
                    e2 = words[child_dict['VOB'][0]]
                    if e1.replace(' ', '') and e2.replace(' ', ''):
                        svos.append([e1, r, e2])

                # subject-verb-complement pattern with a prepositional object
                if 'SBV' in child_dict and 'CMP' in child_dict:
                    e1 = words[child_dict['SBV'][0]]
                    cmp_index = child_dict['CMP'][0]
                    r = words[index] + words[cmp_index]
                    if 'POB' in child_dict_list[cmp_index]:
                        e2 = words[child_dict_list[cmp_index]['POB'][0]]
                        if e1.replace(' ', '') and e2.replace(' ', ''):
                            svos.append([e1, r, e2])

        return svos

    '''Main triple-extraction routine'''

    def ruler2(self, words, postags, child_dict_list, arcs):
        svos = []
        for index in range(len(postags)):
            tmp = 1
            # originally triples were first extracted with the help of semantic role labelling results
            if tmp == 1:
                # if the semantic role labels are empty, fall back to dependency parsing
                # if postags[index] == 'v':
                if postags[index]:
                    # extract fact triples centered on the predicate
                    child_dict = child_dict_list[index]
                    # subject-verb-object (SVO)
                    if 'SBV' in child_dict and 'VOB' in child_dict:
                        r = words[index]
                        e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                        e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                        if e1.replace(' ', '') and e2.replace(' ', ''):
                            svos.append([e1, r, e2])

                    # postposed attributive combined with a verb-object relation
                    relation = arcs[index][0]
                    head = arcs[index][5]   # 0-based index of this word's head
                    if relation == 'ATT':
                        if 'VOB' in child_dict:
                            e1 = self.complete_e(words, postags, child_dict_list, head)
                            r = words[index]
                            e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                            temp_string = r + e2
                            if temp_string == e1[:len(temp_string)]:
                                e1 = e1[len(temp_string):]
                            if temp_string not in e1:
                                if e1.replace(' ', '') and e2.replace(' ', ''):
                                    svos.append([e1, r, e2])

                    # subject-verb-complement pattern with a prepositional object
                    if 'SBV' in child_dict and 'CMP' in child_dict:
                        e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                        cmp_index = child_dict['CMP'][0]
                        r = words[index] + words[cmp_index]
                        if 'POB' in child_dict_list[cmp_index]:
                            e2 = self.complete_e(words, postags, child_dict_list, child_dict_list[cmp_index]['POB'][0])
                            if e1.replace(' ', '') and e2.replace(' ', ''):
                                svos.append([e1, r, e2])
        return svos

    '''Expand an extracted subject or object with its modifiers'''

    def complete_e(self, words, postags, child_dict_list, word_index):
        child_dict = child_dict_list[word_index]
        prefix = ''
        if 'ATT' in child_dict:
            for i in range(len(child_dict['ATT'])):
                prefix += self.complete_e(words, postags, child_dict_list, child_dict['ATT'][i])
        postfix = ''
        if postags[word_index] == 'v':
            if 'VOB' in child_dict:
                postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
            if 'SBV' in child_dict:
                prefix = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix

        return prefix + words[word_index] + postfix

    '''Top-level driver function'''

    def triples_main(self, content):
        sentences = self.split_sents(content)
        svos = []
        for sentence in sentences:
            print(sentence)
            words, postags, child_dict_list, arcs = self.parser_main(sentence)
            svo = self.ruler2(words, postags, child_dict_list, arcs)
            svos += svo

        return svos
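A minimal usage sketch for SVOParser; the entry point and the sample sentence are illustrative assumptions, not taken from the example:

# Illustrative usage of SVOParser (assumed entry point, not in the original snippet)
if __name__ == '__main__':
    svo_parser = SVOParser()
    text = '李克强总理今天来我家了,我感到非常荣幸'
    for svo in svo_parser.triples_main(text):
        print(svo)   # each item is a [subject, predicate, object] triple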
Example #5
    def __init__(self):
        self.parser = DDParser(use_pos=True)
        print('loaded model')
Example #6
import sys

sys.path.append("../../../..")
from ERNIE.tokenization import BasicTokenizer

import pandas as pd
from ddparser import DDParser

TRAIN_PATH = 'train.csv'
DEV_PATH = 'dev.csv'
TEST_PATH = 'test.csv'
use_cuda = True

if __name__ == "__main__":
    file_paths = [TRAIN_PATH, DEV_PATH, TEST_PATH]
    tokenizer = BasicTokenizer()
    ddp = DDParser(use_cuda=use_cuda,
                   encoding_model='transformer',
                   buckets=True,
                   batch_size=1000)

    for file_path in file_paths:
        df = pd.read_csv(file_path, sep='\t')
        df['ddp_res_a'] = [
            str(ddp_res) for ddp_res in ddp.parse([
                tokenizer._clean_text(query)
                for query in df['text_a'].tolist()
            ])
        ]
        df['ddp_res_b'] = [
            str(ddp_res) for ddp_res in ddp.parse([
                tokenizer._clean_text(query)
                for query in df['text_b'].tolist()
            ])
        ]
Example #7
            return sub_tokens[0][0]
        tokens, _ = zip(*sub_tokens)

        return "".join(tokens)

    def inorder_traversal(self, node):
        """中序遍历"""
        lf_list = []
        rf_list = []
        for ln in node.lefts:
            if self.nodes[ln].deprel not in ['COO']:
                lf_list += self.inorder_traversal(self.nodes[ln])
        for rn in node.rights:
            if self.nodes[rn].deprel not in ['COO']:
                rf_list += self.inorder_traversal(self.nodes[rn])

        return lf_list + [(node.word, node.deprel)] + rf_list


if __name__ == "__main__":
    ddp = DDParser(encoding_model='transformer')
    text = ["百度是一家高科技公司"]
    ddp_res = ddp.parse(text)
    print(ddp_res)
    # fine-grained structure info
    fine_info = FineGrainedInfo(ddp_res[0])
    print("Fine-grained:", fine_info.parse())
    # coarse-grained structure info
    coarse_info = CoarseGrainedInfo(ddp_res[0])
    print("Coarse-grained:", coarse_info.parse())
Example #8
from ddparser import DDParser
from main import characterlen, count_type, get_corp

ddp = DDParser(use_pos=True)


# compute the average number of occurrences of a given value (e.g. a deprel label) per parsed sentence
def count_rel(text, head, info):
    count = 0
    for i, sentence in enumerate(text):
        if i % 1000 == 0:
            print(str(i) + ' sentences have been processed.')
        if sentence[head].count(info) > 0:
            count += sentence[head].count(info)
    return format(count / len(text), '.2f')


# parse the texts of different corpora
sents_1950 = get_corp('corpus_50_65.txt')
data_1950 = ddp.parse(sents_1950)
print('Finish reading the treebank_1950.')

sents_1966 = get_corp('corpus_66_76.txt')
data_1966 = ddp.parse(sents_1966)
print('Finish reading the treebank_1966.')

sents_1978 = get_corp('corpus_78_99.txt')
data_1978 = ddp.parse(sents_1978)
print('Finish reading the treebank_1978.')

sents_2000 = get_corp('corpus_00_10.txt')
data_2000 = ddp.parse(sents_2000)
print('Finish reading the treebank_2000.')
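For illustration, the count_rel helper defined above could then be applied to any of the parsed corpora, e.g.:

# Illustrative check (not in the original script): average number of
# subject (SBV) relations per sentence in the 1950-65 corpus.
print('SBV per sentence, 1950-65:', count_rel(data_1950, 'deprel', 'SBV'))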
Example #9
from ddparser import DDParser
import re

# using ddparser
ddp = DDParser(use_pos=True)


# define a function to cut sentences
def cut_sent(text):
    new_sents = []
    # split on Chinese end-of-sentence punctuation, keeping the delimiter and any
    # trailing close quotes/brackets (”, ", ), 」) with the sentence
    sents = re.split(
        r'(。”*"*)*」*|!”*"*)*」*|?”*"*|\.{6}”*」"*)*|……”*)*"*」*|\.」*"*)', text)
    for i in range(int(len(sents) / 2)):
        sent = sents[2 * i] + sents[2 * i + 1]
        new_sents.append(sent)
    return new_sents


# Define a function to count type
def count_type(parsedsents):
    type_list = []
    for sent in parsedsents:
        for word in sent['word']:
            if word not in type_list:
                type_list.append(word)
    return len(type_list)


# define a function to generate conllx format
def build_conllx(data, filename):
    with open(filename, 'w', encoding='utf-8') as file: