Example no. 1
def jufa_fenxi(words,postags):
    """句法分析"""
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    arcs = parser.parse(words, postags)

    print ("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
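Below is a hypothetical usage sketch for jufa_fenxi; the MODELDIR path, Segmentor and Postagger setup are illustrative assumptions rather than part of the original example.

# Hypothetical usage (assumes MODELDIR points at the same LTP model directory jufa_fenxi reads)
import os
from pyltp import Segmentor, Postagger

MODELDIR = "./ltp_data"  # placeholder path

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))

words = list(segmentor.segment("元芳你怎么看"))
postags = list(postagger.postag(words))
jufa_fenxi(words, postags)   # prints head:relation pairs, e.g. "4:SBV 4:SBV 4:ADV 0:HED"

segmentor.release()
postagger.release()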
Example no. 2
def segmentation(filename, output_filename):

    print "segmenting '%s' to '%s'" % (filename, output_filename)

    f = open(filename, "r")
    lines = f.readlines()
    f.close()

    MODELDIR = "./ltp_data/"

    # segment
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))

    # postag
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    
    # Named Entity Recognize
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    
    # Parse and get SVO
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    
    f = open(output_filename, "w")
    fner = open(output_filename.split(".")[0]+"_ner.txt", "w")

    for _line in lines:
        # strip the trailing line break ("\n" or "\r\n") without dropping real characters
        line = _line.rstrip("\r\n")
        
        words = segmentor.segment(line)
        postags = postagger.postag(words)
#        netags = recognizer.recognize(words, postags)
#        arcs = parser.parse(words, postags)

        for i in range(len(words)):
            f.write( "%s/%s\t" % (words[i], postags[i]))
#            if netags[i]!='O':
#                fner.write("%s/%s\t" % (words[i], netags[i]))
        f.write("\n")
#        fner.write("\n")

    f.close()
    fner.close()

    # release the models loaded above
    segmentor.release()
    postagger.release()
    recognizer.release()
    parser.release()
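A hypothetical call to the function above; the file names are placeholders for a UTF-8 input file with one sentence per line.

# Hypothetical invocation with placeholder file names
segmentation("input.txt", "output_seg.txt")
# word/POS pairs go to output_seg.txt; output_seg_ner.txt is the (currently unused) NER output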
Example no. 3
class LtpModel(object):
    """
    封装pyltp model 类,方便使用
    """
    @pysnooper.snoop()
    def __init__(self, LTP_DATA_DIR):
        """加载pyltp模型"""
        self.LTP_DATA_DIR = LTP_DATA_DIR  # pyltp的存放路径

        # word segmentation model path; the model file name is 'cws.model'
        cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')
        self.segmentor = Segmentor()
        self.segmentor.load(cws_model_path)

        # POS tagging model path; the model file name is 'pos.model'
        pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')
        self.postager = Postagger()
        self.postager.load(pos_model_path)

        # named entity recognition model path; the model file name is 'ner.model'
        ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model_path)

        # dependency parsing model path; the model file name is 'parser.model'
        par_model_path = os.path.join(self.LTP_DATA_DIR, 'parser.model')
        self.parser = Parser()
        self.parser.load(par_model_path)

        # # semantic role labelling model path; the model file name is 'pisrl.model'
        # srl_model_path = os.path.join(self.LTP_DATA_DIR, 'pisrl.model')
        # self.labeller = SementicRoleLabeller()  # create the instance
        # self.labeller.load(srl_model_path)  # load the model

    def load_model(self):
        # """加载pyltp模型"""
        # # 分词模型路径,分词模型名称是‘cws.model’
        # self.segment = Segmentor()
        # print(cws_model_path)
        # self.segment.load(cws_model_path)

        # # 词性标注模型路径,分词模型名称是‘pos.model’
        # self.postager = Postagger()
        # self.postager.load(pos_model_path)
        #
        # # 命名实体识别模型路径,模型名称为`pos.model`
        # self.recognizer = NamedEntityRecognizer()
        # self.recognizer.load(ner_model_path)
        #
        # # 依存句法分析模型路径,模型名称为`parser.model`
        # self.parser = Parser()
        # self.parser.load(par_model_path)
        #
        # # 语义角色标注模型目录路径,模型目录为`srl`
        # self.labeller = SementicRoleLabeller()  # 初始化实例
        # self.labeller.load(srl_model_path)  # 加载模型

        # 加载word2vec 模型
        pass

    @pysnooper.snoop()
    def release_all_model(self):
        """释放模型"""
        self.segmentor.release()
        self.postager.release()
        self.recognizer.release()
        self.parser.release()
        # release of the word2vec model
        pass

    # sentence splitting
    @pysnooper.snoop()
    def split_sentences(self, string):
        sents = SentenceSplitter.split(string)
        sentences = [s for s in sents if len(s) != 0]
        return sentences

    def jieba_word_cut(self, string):
        string = re.findall(
            '[\d|\w|\u3002 |\uff1f |\uff01 |\uff0c |\u3001 |\uff1b |\uff1a |\u201c |\u201d |\u2018 |\u2019 |\uff08 |\uff09 |\u300a |\u300b |\u3008 |\u3009 |\u3010 |\u3011 |\u300e |\u300f |\u300c |\u300d |\ufe43 |\ufe44 |\u3014 |\u3015 |\u2026 |\u2014 |\uff5e |\ufe4f |\uffe5]+',
            string)
        string = ' '.join(string)

        return ' '.join(jieba.cut(string))

    # word segmentation
    @pysnooper.snoop()
    def split_words(self, sentences):
        sents = [self.jieba_word_cut(s) for s in sentences]
        return sents

    # POS tagging
    @pysnooper.snoop()
    def get_word_pos(self, sents):
        postags = [self.postager.postag(words.split()) for words in sents]
        postags = [list(w) for w in postags]
        return postags

    # dependency parsing
    @pysnooper.snoop()
    def dependency_parsing(self, sents, postags, said):

        contents = []
        for index in range(len(sents)):
            wo = sents[index].split()
            po = postags[index]

            netags = self.recognizer.recognize(wo, po)  # named entity recognition
            netags = list(netags)
            # print(netags)
            # only run the dependency parse when the sentence contains a person (S-Nh),
            # organization (S-Ni) or place (S-Ns) entity; otherwise skip it
            if ('S-Nh' not in netags) and ('S-Ni' not in netags) and ('S-Ns' not in netags):
                continue

            arcs = self.parser.parse(wo, po)

            arcs = [(arc.head, arc.relation) for arc in arcs]
            # print(arcs)  #[(2, 'SBV'), (0, 'HED'), (5, 'SBV'), (5, 'ADV'), (2, 'VOB')]
            arcs = [(i, arc) for i, arc in enumerate(arcs)
                    if arc[1] == 'SBV']  # keep only SBV (subject-verb) arcs
            # print(arcs)  #[(0, (2, 'SBV')), (2, (5, 'SBV'))]
            for arc in arcs:
                verb = arc[1][0]  # e.g. 2, 5
                subject = arc[0]  # e.g. 0, 1
                if wo[verb - 1] not in said:  # keep it only when the verb is in the pre-built "said" synonym list
                    continue
                # print(wo[subject], wo[verb - 1], ''.join(wo[verb:]))
                contents.append((wo[subject], wo[verb - 1],
                                 ''.join(wo[verb:])))  # (speaker, "say"-type verb, spoken content)

        return contents

    @pysnooper.snoop()
    def get_sentences_json_result(self, string):
        """
        对输入的句子进行SBV提取
        :param string:
        :return: list of dict [{}]
        """

        sentences = self.split_sentences(string)  # sentence splitting
        sents = self.split_words(sentences)  # word segmentation
        postags = self.get_word_pos(sents)  # POS tagging
        contents = self.dependency_parsing(sents, postags, txt_said)  # dependency parsing

        # assemble the JSON result
        contents_dict = []
        for name, trigger, content in contents:
            # JSON fields
            result = {
                'name': name,
                'trigger': trigger,
                'content': content
            }
            contents_dict.append(result)
        return contents_dict
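A hypothetical usage sketch for LtpModel; the model directory and the txt_said list of "say"-type verbs (a module-level global read by get_sentences_json_result) are assumptions here, not part of the original example.

# Hypothetical usage; LTP_DATA_DIR and txt_said are assumed to exist at module level
model = LtpModel('./ltp_data_v3.4.0')                     # placeholder model directory
text = '新华社记者报道,李明表示本次会议取得了积极成果。'  # illustrative input text
for item in model.get_sentences_json_result(text):
    print(item['name'], item['trigger'], item['content'])
model.release_all_model()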
Example no. 4
class Ltp(LtpSegment):
    __model_dir = os.path.join('source', 'ltp_data_v3.4.0')

    # POS tagging
    postagger = Postagger()
    postagger.load(os.path.join(__model_dir, "pos.model"))

    # named entity recognition
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(__model_dir, "ner.model"))

    # dependency parsing
    parser = Parser()
    parser.load(os.path.join(__model_dir, "parser.model"))

    # semantic role labelling
    labeller = SementicRoleLabeller()
    labeller.load(os.path.join(__model_dir, "pisrl.model"))

    def __init__(self):
        super().__init__()

    def postag(self, words):
        """
        词性标注
        :param input: 分词结果 list
        :return: 词性 list
        """
        postags = self.postagger.postag(words)
        return list(postags)

    def recognize(self, words, postags):
        """
        命名实体识别:
        1. LTP 采用 BIESO 标注体系:B表示实体开始词;I表示实体中间词;E表示实体结束词;
           S表示单独成实体;O表示不构成命名实体
        2. LTP 提供的命名实体类型为:人名(Nh);地名(Ns);机构名(Ni)
        3. B、I、E、S位置标签和实体类型标签之间用一个横线 - 相连;O标签后没有类型标签
        例如:
            S-Nh 表示单独一个词构成了人名。
        :param words: 分词结果 list
        :param postags: 词性标注结果 list
        :return: 命名实体标注结果 list
        """
        netags = self.recognizer.recognize(words, postags)
        return list(netags)

    def parse(self, words, postags):
        """
        依存句法分析
        :param words: 分词结果 list
        :param postags: 词性标注结果 list
        :return: ltp原生结果
            (arc.head, arc.relation) for arc in arcs
            ROOT节点的索引是0,第一个词开始的索引依次为1、2、3
            arc.relation 表示依存弧的关系。
            arc.head 表示依存弧的父节点词的索引,arc.relation 表示依存弧的关系。
        例:
        inputs:
            words = ['元芳', '你', '怎么', '看']
            postags = ['nh', 'r', 'r', 'v']
        output:
            4:SBV 4:SBV 4:ADV 0:HED
            输出格式为 head:relation
        """
        arcs = self.parser.parse(words, postags)
        return arcs

    def label(self, words, postags, arcs):
        """
        语义角色标注
        :param words: 分词结果 list
        :param postags: 词性标注结果 list
        :param arcs: 依存句法分析结果 ltp
        :return: ltp原生结果
            (arg.name, arg.range.start, arg.range.end) for arg in role.arguments
            第一个词开始的索引依次为0、1、2
            返回结果 roles 是关于多个谓词的语义角色分析的结果。由于一句话中可能不含有语义角色,所以
            结果可能为空。role.index 代表谓词的索引, role.arguments 代表关于该谓词的若干语义角
            色。arg.name 表示语义角色类型,arg.range.start 表示该语义角色起始词位置的索引,
            arg.range.end 表示该语义角色结束词位置的索引。
        例:
        inputs:
            words = ['元芳', '你', '怎么', '看']
            postags = ['nh', 'r', 'r', 'v']
            arcs 使用依存句法分析的结果
        output:
            3 A0:(0,0)A0:(1,1)ADV:(2,2)

            由于结果输出一行,所以“元芳你怎么看”有一组语义角色。
            其谓词索引为3,即“看”。
            这个谓词有三个语义角色范围分别是:
                (0,0)即“元芳”,(1,1)即“你”,(2,2)即“怎么”,类型分别是A0、A0、ADV。
        """
        roles = self.labeller.label(words, postags, arcs)
        return roles

    def get_name_entity(self, sentence, entity_type):
        """
        获取句子中特定的命名实体集
        :param sentence: 待分析句子
        :param entity_type: 待分析命名实体类型,可选值
        :return:
        """
        words = self.segment(sentence)
        postags = self.postag(words)
        ne_tags = self.recognize(words, postags)
        sentence_len = len(words)

        ret_entity = set()
        entity_pattern = ""
        for i in range(sentence_len):
            if (ne_tags[i] == 'B-' + entity_type) or (ne_tags[i]
                                                      == 'I-' + entity_type):
                entity_pattern += words[i]
            elif (ne_tags[i] == 'E-' + entity_type) or (ne_tags[i]
                                                        == 'S-' + entity_type):
                entity_pattern += words[i]
                ret_entity.add(entity_pattern)
                entity_pattern = ""

        return list(ret_entity)
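A hypothetical usage sketch for the class above; it assumes the LtpSegment base class (not shown) provides segment() and that the model files live under source/ltp_data_v3.4.0.

# Hypothetical usage; LtpSegment is assumed to provide segment()
ltp = Ltp()
words = ltp.segment('元芳你怎么看')          # assumed to return ['元芳', '你', '怎么', '看']
postags = ltp.postag(words)                  # e.g. ['nh', 'r', 'r', 'v']
arcs = ltp.parse(words, postags)
roles = ltp.label(words, postags, arcs)
persons = ltp.get_name_entity('元芳你怎么看', 'Nh')   # e.g. ['元芳']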
Example no. 5
import os

LTP_DATA_DIR = r"D:\myprojects\LTP\ltp_data_v3.4.0"  # raw string keeps the backslashes literal
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parsing model, file name `parser.model`
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')     # named entity recognition model
srl_model_path = os.path.join(LTP_DATA_DIR, 'srl')           # semantic role labelling model (directory-style model)

from pyltp import Segmentor,Postagger,Parser,NamedEntityRecognizer,SementicRoleLabeller

segmentor = Segmentor()  # create the instance
segmentor.load(cws_model_path)
postagger = Postagger()  # create the instance
postagger.load(pos_model_path)  # load the model
parser = Parser()  # create the instance
parser.load(par_model_path)  # load the model
recognizer = NamedEntityRecognizer()
recognizer.load(ner_model_path)
labeller = SementicRoleLabeller()
labeller.load(srl_model_path)


line='手机外型很漂亮,屏幕也不错,就是太容易发烫了,电池不耐用,这些都是预想到的,我很少玩游戏就还好。喇叭真的太垃圾了。'

words = list(segmentor.segment(line))
postags = list(postagger.postag(words))
arcs = parser.parse(words, postags)  # dependency parsing
netags = recognizer.recognize(words, postags)  # named entity recognition
roles = labeller.label(words, postags, netags, arcs)  # semantic role labelling (this 4-argument form appears to match older pyltp releases)
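A sketch of how the results above can be inspected, following the output conventions documented in the wrapper class earlier on this page; note that newer pyltp releases expect labeller.label(words, postags, arcs) without the netags argument, so the call above may need adjusting depending on the installed version.

# Print the analysis results (uses the objects created above)
print('\t'.join(words))
print('\t'.join(postags))
print('\t'.join('%d:%s' % (arc.head, arc.relation) for arc in arcs))
print('\t'.join(netags))
for role in roles:
    print(role.index, ''.join(
        '%s:(%d,%d)' % (arg.name, arg.range.start, arg.range.end)
        for arg in role.arguments))

# release the models when done
segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()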
Example no. 6
def test_ltp():
    from pyltp import Segmentor
    segmentor = Segmentor()
    #segmentor.load('/Users/a000/Downloads/ltp-models/3.3.2/ltp_data.model')
    segmentor.load('/Users/a000/git/ltp_data/cws.model')
    words = segmentor.segment('元芳你怎么看')
    words = segmentor.segment('这本书很好, 我喜欢iphone, 1.5')
    words = segmentor.segment('张子萱怀孕了')
    words = segmentor.segment('我有一本书')
    words = segmentor.segment('今天是2017年3月30日, 清朝的官员')
    words = segmentor.segment('蚂蚁金服近日上市')
    words = segmentor.segment('国家主席习近平抵达美国佛罗里达州')
    words = segmentor.segment('独家|你想要的胸以下全是腿, 科切拉潮人用不')
    total_txt = '<a href=\"http://deeporiginalx.com/search.html#sw=%E7%AC%AC%E4%B8%80%E7%99%BD%E9%93%B6%E7%BD%91\" target=\"_blank\">第一白银网</a>4月19日讯<a href=\"http://deeporiginalx.com/search.html#sw=%E7%8E%B0%E8%B4%A7%E7%99%BD%E9%93%B6\" target=\"_blank\">现货白银</a>今日早盘走势受到美元反弹影响继续走软,目前交投于18.2一线,本周二美国总统特朗普再次提及税改政策,并且宣称将会以“迅雷不及掩耳之势”落地,据小编分析,税改落地将会利好美国经济,从而利好美元,打压白银走势,但问题是,3月份连医改都进展不顺,税改会通过吗?(<a href=\"http://deeporiginalx.com/search.html#sw=%E7%BC%96%E8%BE%91%E6%8E%A8%E8%8D%90%EF%BC%9A%E6%9C%AA%E6%9D%A5%E7%99%BD%E9%93%B6%E8%B5%B0%E5%8A%BF%E5%88%86%E6%9E%90\" target=\"_blank\"><strong><span>编辑推荐:未来白银走势分析</span></strong></a>'
    total_txt = "<span class=\"article_src\">游民星空</span>2017-04-09<span>阅读原文</span>"
    soup = BeautifulSoup(total_txt, 'lxml')
    total_txt = soup.get_text()
    print total_txt
    print type(total_txt)
    words = segmentor.segment(total_txt.encode('utf-8'))
    #words = segmentor.segment(s)
    for i in words:
        print i

    import jieba
    w_jieba = jieba.cut('独家|你想要的胸以下全是腿, 科切拉潮人用不')
    print '!!!!!'
    for i in w_jieba:
        print i

    from pyltp import Postagger
    poser = Postagger()
    poser.load('/Users/a000/git/ltp_data/pos.model')
    #words_pos = poser.postag(words)
    #for i in xrange(len(words_pos)):
    #    print words[i]
    #    print words_pos[i]

    s1 = '张继科:脚伤恢复七八成 现在不是想退役的时候'
    s2 = '张继科:脚伤恢复八成 现在还不是退役的时候'
    #s2 = '张继科和马龙:脚伤恢复八成 现在还不是退役的时候'
    s3 = '张继科:脚伤已恢复7-8成 现在还不是退役的时候'

    s4 = '国际乒联排名:马龙丁宁占据榜首 张继科第四'
    s5 = '国际乒联公布排名:马龙丁宁第一 张继科第四'

    s6 = '国家主席习近平抵达美国佛罗里达州'
    s7 = '习近平抵达美国佛罗里达州'

    s8 = '习近平抵达美国佛罗里达州 同特朗普会晤'
    s9 = '习近平抵达美国佛罗里达州 将与特朗普举行会晤'
    s10 = '习近平抵达美国 将同特朗普举行会晤'
    s11 = '习近平抵达美国佛罗里达州 将同特朗普举行中美元首会晤'

    s12 = '【V观】习近平引用芬兰谚语:没有人的开拓就不会有路'
    s13 = '习近平引用芬兰谚语:没有人的开拓就不会有路'

    s14 = '习近平就圣彼得堡地铁发生爆炸造成伤亡向普京致慰问电'  #
    s15 = '习近平就圣彼得堡地铁爆炸事件向普京致慰问电'  #15135383
    ss16 = '习近平就圣彼得堡市地铁发生爆炸造成严重人员伤亡向普京致慰问电'  #15130013
    ss17 = '习近平就圣彼得堡市地铁爆炸向普京致慰问电'  #15127277

    s16 = '习近平离京对芬兰进行国事访问并赴美国举行中美元首会晤'  #15131991
    s17 = '习近平离京对芬兰进行国事访问并赴美举行中美元首会晤'  #15132864
    s18 = '习近平离京对芬兰共和国进行国事访问并赴美国佛罗里达州举行中美元首会晤'  #15131971
    ws1 = segmentor.segment(s6)
    ws2 = segmentor.segment(s7)
    print '  '.join(ws1)
    print '  '.join(ws2)
    pos1 = poser.postag(ws1)
    pos2 = poser.postag(ws2)
    print ' '.join(pos1)
    print ' '.join(pos2)

    from pyltp import NamedEntityRecognizer
    reco = NamedEntityRecognizer()
    reco.load('/Users/a000/git/ltp_data/ner.model')
    ne1 = reco.recognize(ws1, pos1)
    ne2 = reco.recognize(ws2, pos2)
    print ' '.join(ne1)
    print ' '.join(ne2)

    from pyltp import Parser
    parser = Parser()
    parser.load('/Users/a000/git/ltp_data/parser.model')
    arc1 = parser.parse(ws1, pos1)
    arc2 = parser.parse(ws2, pos2)
    print ' '.join("%d:%s" % (arc.head, arc.relation) for arc in arc1)
    print ' '.join("%d:%s" % (arc.head, arc.relation) for arc in arc2)
Example no. 7
def getRelation(paragraph):
    """
	paragraph: a list of string, each string is a sentence
	return: a list of relations and a dict which records the number of occurrence of differents DSNF
	"""
    relations = []
    dict_DSNF = {
        'num_DSNF1': 0,
        'num_DSNF2': 0,
        'num_DSNF3': 0,
        'num_DSNF7': 0,
    }

    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))

    for iteration, sentence in enumerate(paragraph):

        sentence = SentenceSplitter.split(sentence)[0]

        words = segmentor.segment(sentence)
        # print("\t".join(words))

        postags = postagger.postag(words)
        # the list-of-strings parameter is supported since pyltp 0.1.5
        # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
        # print("\t".join(postags))

        arcs = parser.parse(words, postags)

        # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

        netags = recognizer.recognize(words, postags)
        # print("\t".join(netags))

        # labeller = SementicRoleLabeller()
        # labeller.load(os.path.join(MODELDIR, "pisrl.model"))
        # roles = labeller.label(words, postags, arcs)
        # for role in roles:
        #     print(role.index, "".join(
        #             ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))

        entityList = findEntities(netags)
        # print(entityList)
        entities = []
        for i in entityList:
            l = ''
            for j in i:
                l += words[j]
            entities.append(l)

        DSNF1_ret = DSNF1(arcs, entityList, words, netags)
        DSNF2_ret = DSNF2(arcs, entityList, words)
        DSNF3_ret = DSNF3(arcs, entityList, words, postags)
        DSNF7_ret = DSNF7(arcs, entityList, words)
        # print("DSNF1 result: ", DSNF1_ret)
        # print("DSNF2 result: ", DSNF2_ret)
        # print("DSNF3 result: ", DSNF3_ret)
        # print("DSNF7 result: ", DSNF7_ret)
        relation = []
        for key, ret in (('num_DSNF1', DSNF1_ret), ('num_DSNF2', DSNF2_ret),
                         ('num_DSNF3', DSNF3_ret), ('num_DSNF7', DSNF7_ret)):
            for r in ret:
                dict_DSNF[key] += 1
                new_r = [r[0], r[2], r[1]]
                relation.append((new_r, sentence))
                relations.append((new_r, sentence))
        if len(relation) > 0:
            print("evaluate the " + str(iteration + 1) + "-th sentences")
            print("entities in " + str(iteration + 1) + "-th sentence : ",
                  entities)
            for one in relation:
                r = one[0]
                data = {'sentence': sentence, 'kg': [r[0], r[1], r[2]]}
                # print('r',r)
                key = get_key(data)
                old = DB.kg_mark.find_one({"_id": key})
                if old is None:
                    kg.mark_sentence(key, data)
                else:
                    print("already exists, skipping")
                    continue
                print(one)

                p, softmax = pre(data)
                print("with entities relation: ", r)
                print("prediction:", p, "probability:", softmax)
                data['label'] = p
                data['state'] = '4'  # state 4 keeps these records separate
                print(data)

            # if len(relation) == 3:
            #     print("relation", relation[1], relation[2], relation[0])
            print("--" * 30)

    segmentor.release()
    postagger.release()
    parser.release()
    recognizer.release()
    # labeller.release()

    return relations, dict_DSNF
Example no. 8
    for k, v in type2questions.items():
        print(k, len(v))
        for i in v[:10]:
            print(i)
    with open('../data/question_type.txt', 'w') as f_out:
        for k, v in type2questions.items():
            f_out.write(k + '\n')
            for i in v:
                tmp = ' '.join(i[0])
                f_out.write(tmp + '\t' + '\t'.join(i[1:]) + '\n')


if __name__ == '__main__':
    # test('2006年7月27日,360安全卫士正式推出。')
    # get_all_questions()
    my_parser = Parser()
    # analysis_questions(my_parser)
    # my_parser.get_question_type('缓刑适用于几年以下的有期徒刑')
    my_parser.read_train_set('../data/BoP2017-DBQA.train.txt')
    count = 0
    for i in range(len(my_parser.articles)):
        res = my_parser.analysis_question(i, debug=False)
        # for i in range(10):
        #     res = my_parser.analysis_question(i, debug=True)
        if res == 0:
            count += 0
        else:
            count += 1.0 / res
    print('score', count / len(my_parser.articles))

    # my_parser.analysis_question(0)
Example no. 9
File: nlp.py Project: 89935/OpenRE
class NLP:
    """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析
    Attributes:
        default_user_dict_dir:str,用户自定义词典目录
        default_model_dir:str,ltp模型文件目录
    """

    entity_verb_new = entity_verb_new()
    all_entity = entity_verb_new.readAllEntity(
        "../../entity_verb//entity_verb_result\\all_entity.json")
    default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\'  # directory of the LTP model files

    def __init__(self, model_dir=default_model_dir, all_entity=all_entity):
        self.default_model_dir = model_dir
        #initialise the segmenter
        #jieba is used for segmentation; all extracted entities are added to the jieba dictionary
        for entity in all_entity:
            jieba.add_word(entity, 100000)
        jieba.add_word("天府永藏展", 100000)
        # jieba.add_word("始建于",100000)
        # pynlpir.open()  # initialise the (pynlpir) segmenter
        # # add user dictionaries (legal-document dictionary and the Tsinghua legal dictionary);
        # # this loads them into memory, which is faster
        # files = os.listdir(user_dict_dir)
        # for file in files:
        #     file_path = os.path.join(user_dict_dir,file)
        #     # skip directories
        #     if os.path.isdir(file):
        #         continue
        #     with open(file_path,'r',encoding = 'utf-8') as f:
        #         line = f.readline()
        #         while line:
        #             word = line.strip('\n').strip()
        #             pynlpir.nlpir.AddUserWord(c_char_p(word.encode()))
        #             line = f.readline()
        #load the ltp models
        #POS tagging model
        self.postagger = Postagger()
        postag_flag = self.postagger.load(
            os.path.join(self.default_model_dir, 'pos.model'))
        #named entity recognition model
        self.recognizer = NamedEntityRecognizer()
        ner_flag = self.recognizer.load(
            os.path.join(self.default_model_dir, 'ner.model'))
        #dependency parsing model
        self.parser = Parser()
        parser_flag = self.parser.load(
            os.path.join(self.default_model_dir, 'parser.model'))

        if postag_flag or ner_flag or parser_flag:  # there may have been errors
            print('load model failed')

    def segment(self, sentence, entity_postag=dict()):
        """
        采用NLPIR进行分词处理
        Args:
            Sentence:String,句子
            entity_postag : dict,实体词性词典,默认为空集合,分析每一个案例的结构化文本时产生
        Returns:
            lemma:list,分词结果
        """
        #添加实体词典
        # if entity_postag:
        #     for entity in entity_postag:
        #         pynlpir.nlpir.AddUserWord(c_char_p(entity.encode()))
        # pynlpir.nlpir.AddUserWord(c_char_p('前任'.encode()))#单个用户加入示例
        # pynlpir.nlpir.AddUserWord(c_char_p('习近平'.encode()))#单个用户加入示例
        #分词,不进行词性标注
        result = jieba.cut(sentence)
        # pynlpir.close()  # 释放
        lemmas = []
        for lemma in result:
            lemmas.append(lemma)
        # lemmas = pynlpir.segment(sentence,pos_tagging=False)
        #pynlpir.close() #释放
        return lemmas

    def getPostag(self):
        return self.postagger

    def postag(self, lemmas):
        """
        Parameters
        ----------
        lemmas : List,分词后的结果
        entity_dict:Set,实体词典,处理具体的一则判决书的结构化文本时产生
        Returns
        -------
        words:WordUnit List,包括分词与词性标注的结果
        """
        words = []
        # POS tagging
        postags = self.postagger.postag(lemmas)
        for i in range(len(lemmas)):
            # store each segmented word with its POS tag as a WordUnit; numbering starts at 1
            word = WordUnit(i + 1, lemmas[i], postags[i])
            words.append(word)
        #self.postagger.release()  # release
        return words

    def get_postag(self, word):
        """获得单个词的词性标注
        Args:
            word:str,单词
        Returns:
            pos_tag:str,该单词的词性标注
        """
        pos_tag = self.postagger.postag([word])
        return pos_tag[0]

    def netag(self, words):
        """
        命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并
        Parameters
            words : WordUnit list,包括分词与词性标注结果
        Returns
            words_netag:WordUnit list,包含分词,词性标注与命名实体识别的结果
        """
        lemmas = []  #存储分词后的结果
        postags = []  #存储词性标注结果
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        #命名实体识别
        netags = self.recognizer.recognize(lemmas, postags)
        words_netag = EntityCombine().combine(words, netags)
        return words_netag

    def parse(self, words):
        """
        对分词,词性标注与命名实体识别后的结果进行依存句法分析(命名实体识别可选)
        Args:
            words_netag:WordUnit list,包含分词,词性标注与命名实体识别结果
        Returns
            *:sentenceUnit 句子单元
        """
        lemmas = []  #分词结果
        postags = []  #词性标注结果
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        #依存句法分析
        arcs = self.parser.parse(lemmas, postags)
        for i in range(len(arcs)):
            words[i].head = arcs[i].head
            words[i].dependency = arcs[i].relation
        return SentenceUnit(words)

    def close(self):
        """
        关闭与释放
        """
        # pynlpir.close()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()

    def getSPO1(self, sentence_list):
        for sentence in sentence_list:
            lemmas = nlp.segment(sentence)

            print(lemmas)

            # POS tagging test
            print('***' + 'POS tagging test' + '***')
            words = nlp.postag(lemmas)
            # for word in words:
            #     print(word.to_string())
            # print(words)

            # NER and merge test
            print('***' + 'NER and merge test' + '***')
            words_netag = nlp.netag(words)
            # for word in words_netag:
            #     print(word.to_string())

            # dependency parsing test
            print('***' + 'dependency parsing test' + '***')
            sentence = nlp.parse(words_netag)
            print(sentence.to_string())

            verb = True
            # entity = "乾清宫"
            for item in sentence.words:
                if (item.head_word == None and item.lemma == verb) or (
                        item.lemma == verb and item.dependency == "COO"
                        and item.head_word.head_word == None):
                    relation_verb = item
                    if item.head_word == None:
                        verbId = item.ID
                    elif item.head_word.head_word == None:
                        verbId = item.ID
                        verbId2 = item.head_word.ID
                    O_dict = dict()
                    S_dict = dict()
                    OBJ = None
                    SUB = None
                    for item in sentence.words:
                        if item.dependency == "SBV" and item.head_word.ID == verbId:
                            # if SUB == None or SUB.lemma != entity:
                            SUB = item
                            S_dict[SUB.lemma] = SUB.ID
                        if item.dependency == "VOB" and item.head_word.ID == verbId:
                            OBJ = item
                            O_dict[OBJ.lemma] = OBJ.ID
                    if SUB == None:
                        for item in sentence.words:
                            if item.dependency == "SBV" and item.head_word.ID == verbId2:
                                # if SUB == None or SUB.lemma != entity:
                                SUB = item
                                S_dict[SUB.lemma] = SUB.ID
                    if OBJ == None:
                        for item in sentence.words:
                            if item.dependency == "VOB" and item.head_word.ID == verbId2:
                                OBJ = item
                                O_dict[OBJ.lemma] = OBJ.ID

                    OBJList = []
                    flag = True
                    while flag == True:
                        len1 = len(S_dict)
                        len2 = len(O_dict)
                        for item in sentence.words:
                            if SUB != None and item.head_word != None:
                                SUBList = S_dict.values()
                                if item.head_word.ID in SUBList and (
                                        item.dependency == "ATT"
                                        or item.dependency == "COO"):
                                    SUBATT = item
                                    S_dict[SUBATT.lemma] = SUBATT.ID
                            if OBJ != None and item.head_word != None:
                                OBJList = O_dict.values()
                                if item.head_word.ID in OBJList and (
                                        item.dependency == "ATT"):
                                    OBJATT = item
                                    O_dict[OBJATT.lemma] = OBJATT.ID
                            if len(S_dict) != len1 or len(O_dict) != len2:
                                flag = True
                            else:
                                flag = False
                    O_dict = sorted(O_dict.items(), key=lambda item: item[1])
                    S_dict = sorted(S_dict.items(), key=lambda item: item[1])
                    Object = ""
                    Subject = ""
                    for i in O_dict:
                        Object += i[0]
                    for i in S_dict:
                        Subject += i[0]
                    if SUB != None:
                        print((Subject, verb, Object))

                    S_dict2 = dict()
                    O_dict2 = dict()
                    SUB_COO = None
                    OBJ_COO = None
                    for item in sentence.words:
                        if item.head_word != None:
                            if SUB != None and item.dependency == "COO" and item.head_word.ID == SUB.ID:
                                # if SUB == None or SUB.lemma != entity:
                                SUB_COO = item
                                S_dict2[SUB_COO.lemma] = SUB_COO.ID
                        if item.head_word != None:
                            if item.dependency == "COO" and item.head_word.ID == OBJ.ID:
                                OBJ_COO = item
                                O_dict2[OBJ_COO.lemma] = OBJ_COO.ID

                    flag = True
                    while flag == True:
                        len1 = len(S_dict2)
                        len2 = len(O_dict2)
                        for item in sentence.words:
                            if SUB_COO != None and item.head_word != None:
                                SUBList = S_dict2.values()
                                if item.head_word.ID in SUBList and item.dependency == "ATT":
                                    SUBATT = item
                                    S_dict2[SUBATT.lemma] = SUBATT.ID
                            if OBJ_COO != None and item.head_word != None:
                                OBJList = O_dict2.values()
                                if item.head_word.ID in OBJList and item.dependency == "ATT":
                                    OBJATT = item
                                    O_dict2[OBJATT.lemma] = OBJATT.ID
                            if len(S_dict2) != len1 or len(O_dict2) != len2:
                                flag = True
                            else:
                                flag = False
                    O_dict2 = sorted(O_dict2.items(), key=lambda item: item[1])
                    S_dict2 = sorted(S_dict2.items(), key=lambda item: item[1])
                    if len(O_dict2) or len(S_dict2):
                        if len(O_dict2) == 0:
                            O_dict2 = O_dict
                        if len(S_dict2) == 0:
                            S_dict2 = S_dict

                        Object = ""
                        Subject = ""
                        for i in O_dict2:
                            Object += i[0]
                        for i in S_dict2:
                            Subject += i[0]
                        if SUB != None:
                            print((Subject, verb, Object))

    def getSPO2(self, sentence_list):
        for sentence in sentence_list:
            lemmas = nlp.segment(sentence)

            print(lemmas)

            # POS tagging test
            print('***' + 'POS tagging test' + '***')
            words = self.postag(lemmas)
            # for word in words:
            #     print(word.to_string())
            # print(words)

            # NER and merge test
            print('***' + 'NER and merge test' + '***')
            words_netag = nlp.netag(words)
            # for word in words_netag:
            #     print(word.to_string())

            # dependency parsing test
            print('***' + 'dependency parsing test' + '***')
            sentence = nlp.parse(words_netag)
            print(sentence.to_string())

            # verb = True
            # entity = "乾清宫"
            for item in sentence.words:
                if (item.head_word == None and item.postag == "v") or (
                        item.postag == "v" and item.dependency == "COO"
                        and item.head_word.head_word == None):
                    relation_verb = item
                    if item.head_word == None:
                        verbId = item.ID
                    elif item.head_word.head_word == None:
                        verbId = item.ID
                        verbId2 = item.head_word.ID
                    O_dict = dict()
                    S_dict = dict()
                    OBJ = None
                    SUB = None
                    for item in sentence.words:
                        if item.dependency == "SBV" and item.head_word.ID == verbId:
                            # if SUB == None or SUB.lemma != entity:
                            SUB = item
                            S_dict[SUB.lemma] = SUB.ID
                        if (item.dependency == "VOB" and item.head_word.ID == verbId) or (item.dependency == "POB" and item.head_word.ID == verbId)\
                                or (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP"
                                    and item.head_word.head_word.ID
                        == verbId):
                            OBJ = item
                            O_dict[OBJ.lemma] = OBJ.ID
                    if SUB == None:
                        for item in sentence.words:
                            if item.dependency == "SBV" and item.head_word.ID == verbId2:
                                # if SUB == None or SUB.lemma != entity:
                                SUB = item
                                S_dict[SUB.lemma] = SUB.ID
                    if OBJ == None:
                        for item in sentence.words:
                            if item.dependency == "VOB" and item.head_word.ID == verbId2:
                                OBJ = item
                                O_dict[OBJ.lemma] = OBJ.ID

                    OBJList = []
                    flag = True
                    while flag == True:
                        len1 = len(S_dict)
                        len2 = len(O_dict)
                        for item in sentence.words:
                            if SUB != None and item.head_word != None:
                                SUBList = S_dict.values()
                                if item.head_word.ID in SUBList and (
                                        item.dependency == "ATT"
                                        or item.dependency == "COO"):
                                    SUBATT = item
                                    S_dict[SUBATT.lemma] = SUBATT.ID
                            if OBJ != None and item.head_word != None:
                                OBJList = O_dict.values()
                                if item.head_word.ID in OBJList and (
                                        item.dependency == "ATT"):
                                    OBJATT = item
                                    O_dict[OBJATT.lemma] = OBJATT.ID
                            if len(S_dict) != len1 or len(O_dict) != len2:
                                flag = True
                            else:
                                flag = False
                    O_dict = sorted(O_dict.items(), key=lambda item: item[1])
                    S_dict = sorted(S_dict.items(), key=lambda item: item[1])
                    Object = ""
                    Subject = ""
                    for i in O_dict:
                        Object += i[0]
                    for i in S_dict:
                        Subject += i[0]
                    if SUB != None:
                        print((Subject, relation_verb.lemma, Object))

                    S_dict2 = dict()
                    O_dict2 = dict()
                    SUB_COO = None
                    OBJ_COO = None
                    for item in sentence.words:
                        if item.head_word != None:
                            if SUB != None and item.dependency == "COO" and item.head_word.ID == SUB.ID:
                                # if SUB == None or SUB.lemma != entity:
                                SUB_COO = item
                                S_dict2[SUB_COO.lemma] = SUB_COO.ID
                        if item.head_word != None and OBJ != None:
                            if item.dependency == "COO" and item.head_word.ID == OBJ.ID:
                                OBJ_COO = item
                                O_dict2[OBJ_COO.lemma] = OBJ_COO.ID

                    flag = True
                    while flag == True:
                        len1 = len(S_dict2)
                        len2 = len(O_dict2)
                        for item in sentence.words:
                            if SUB_COO != None and item.head_word != None:
                                SUBList = S_dict2.values()
                                if item.head_word.ID in SUBList and item.dependency == "ATT":
                                    SUBATT = item
                                    S_dict2[SUBATT.lemma] = SUBATT.ID
                            if OBJ_COO != None and item.head_word != None:
                                OBJList = O_dict2.values()
                                if item.head_word.ID in OBJList and item.dependency == "ATT":
                                    OBJATT = item
                                    O_dict2[OBJATT.lemma] = OBJATT.ID
                            if len(S_dict2) != len1 or len(O_dict2) != len2:
                                flag = True
                            else:
                                flag = False
                    O_dict2 = sorted(O_dict2.items(), key=lambda item: item[1])
                    S_dict2 = sorted(S_dict2.items(), key=lambda item: item[1])
                    if len(O_dict2) or len(S_dict2):
                        if len(O_dict2) == 0:
                            O_dict2 = O_dict
                        if len(S_dict2) == 0:
                            S_dict2 = S_dict

                        Object = ""
                        Subject = ""
                        for i in O_dict2:
                            Object += i[0]
                        for i in S_dict2:
                            Subject += i[0]
                        if SUB != None:
                            print((Subject, relation_verb.lemma, Object))

    def getSPO(self, sentence_list):
        for sentence in sentence_list:
            print(sentence)
            lemmas = self.segment(sentence)

            # print(lemmas)

            # POS tagging test
            # print('***' + 'POS tagging test' + '***')
            words = self.postag(lemmas)
            # for word in words:
            #     print(word.to_string())
            # print(words)

            # NER and merge test
            # print('***' + 'NER and merge test' + '***')
            words_netag = self.netag(words)
            # for word in words_netag:
            #     print(word.to_string())

            # dependency parsing test
            # print('***' + 'dependency parsing test' + '***')
            sentence = self.parse(words_netag)
            # print(sentence.to_string())

            # verb = True
            # entity = "乾清宫"
            for item in sentence.words:
                if (item.head_word == None and item.postag == "v") or (
                        item.postag == "v" and item.dependency == "COO"
                        and item.head_word.head_word == None):
                    relation_verb = item
                    if item.head_word == None:
                        verbId = item.ID
                        verbId2 = None
                    elif item.head_word.head_word == None:
                        verbId = item.ID
                        verbId2 = item.head_word.ID
                    O_dict = dict()
                    S_dict = dict()
                    OBJ = None
                    SUB = None
                    for item in sentence.words:
                        if item.dependency == "SBV" and item.head_word.ID == verbId:
                            # if SUB == None or SUB.lemma != entity:
                            SUB = item
                            S_dict[SUB.ID] = SUB.lemma
                        if (item.dependency == "VOB" and item.head_word.ID == verbId) or (item.dependency == "POB" and item.head_word.ID == verbId)\
                                or (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP"
                                    and item.head_word.head_word.ID== verbId):
                            OBJ = item
                            O_dict[OBJ.ID] = OBJ.lemma
                            # if item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP" \
                            #             and item.head_word.head_word.ID == verbId:
                            #     verb_p = item.head_word
                            # O_dict[OBJ.lemma] = OBJ.ID
                    if SUB == None:
                        for item in sentence.words:
                            if item.dependency == "SBV" and item.head_word.ID == verbId2:
                                # if SUB == None or SUB.lemma != entity:
                                SUB = item
                                S_dict[SUB.ID] = SUB.lemma
                    if OBJ == None:
                        for item in sentence.words:
                            if item.dependency == "VOB" and item.head_word.ID == verbId2:
                                OBJ = item
                                O_dict[OBJ.ID] = OBJ.lemma

                    OBJList = []
                    flag = True
                    while flag == True:
                        len1 = len(S_dict)
                        len2 = len(O_dict)
                        for item in sentence.words:
                            if SUB != None and item.head_word != None:
                                SUBList = S_dict.keys()
                                if item.head_word.ID in SUBList and (
                                        item.dependency == "ATT"
                                        or item.dependency == "COO"):
                                    SUBATT = item
                                    S_dict[SUBATT.ID] = SUBATT.lemma
                            if OBJ != None and item.head_word != None:
                                OBJList = O_dict.keys()
                                if item.head_word.ID in OBJList and (
                                        item.dependency == "ATT"
                                        or item.dependency == "COO"):
                                    OBJATT = item
                                    # if item.dependency!="COO":
                                    O_dict[OBJATT.ID] = OBJATT.lemma
                                    # else:
                                    #     O_dict[OBJATT.ID] = OBJATT.lemma+" "

                            if len(S_dict) != len1 or len(O_dict) != len2:
                                flag = True
                            else:
                                flag = False
                    O_dict = sorted(O_dict.items(), key=lambda item: item[0])
                    S_dict = sorted(S_dict.items(), key=lambda item: item[0])
                    Object = ""
                    Subject = ""
                    for i in O_dict:
                        Object += i[1]
                    for i in S_dict:
                        Subject += i[1]
                    if SUB != None:
                        print((Subject, relation_verb.lemma, Object))

                    S_dict2 = dict()
                    O_dict2 = dict()
                    SUB_COO = None
                    OBJ_COO = None
                    for item in sentence.words:
                        if item.head_word != None:
                            if SUB != None and item.dependency == "COO" and item.head_word.ID == SUB.ID:
                                # if SUB == None or SUB.lemma != entity:
                                SUB_COO = item
                                S_dict2[SUB_COO.ID] = SUB_COO.lemma
                        if item.head_word != None and OBJ != None:
                            if item.dependency == "COO" and item.head_word.ID == OBJ.ID:
                                OBJ_COO = item
                                O_dict2[OBJ_COO.ID] = OBJ_COO.lemma

                    flag = True
                    while flag == True:
                        len1 = len(S_dict2)
                        len2 = len(O_dict2)
                        for item in sentence.words:
                            if SUB_COO != None and item.head_word != None:
                                SUBList = S_dict2.keys()
                                if item.head_word.ID in SUBList and item.dependency == "ATT":
                                    SUBATT = item
                                    S_dict2[SUBATT.ID] = SUBATT.lemma
                            if OBJ_COO != None and item.head_word != None:
                                OBJList = O_dict2.keys()
                                if item.head_word.ID in OBJList and item.dependency == "ATT":
                                    OBJATT = item
                                    O_dict2[OBJATT.ID] = OBJATT.lemma
                            if len(S_dict2) != len1 or len(O_dict2) != len2:
                                flag = True
                            else:
                                flag = False
                    O_dict2 = sorted(O_dict2.items(), key=lambda item: item[0])
                    S_dict2 = sorted(S_dict2.items(), key=lambda item: item[0])
                    if len(O_dict2) or len(S_dict2):
                        if len(O_dict2) == 0:
                            O_dict2 = O_dict
                        if len(S_dict2) == 0:
                            S_dict2 = S_dict

                        Object = ""
                        Subject = ""
                        for i in O_dict2:
                            Object += i[1]
                        for i in S_dict2:
                            Subject += i[1]
                        if SUB != None:
                            print((Subject, relation_verb.lemma, Object))
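A hypothetical end-to-end use of the class above, mirroring the segment → postag → netag → parse pipeline that getSPO follows; WordUnit, SentenceUnit, EntityCombine and entity_verb_new are project classes assumed to be importable, and the sentence is illustrative.

# Hypothetical usage sketch (the default model directory is taken from the class definition)
nlp = NLP()
lemmas = nlp.segment('故宫始建于明朝。')
words = nlp.postag(lemmas)
words_netag = nlp.netag(words)
sentence = nlp.parse(words_netag)
print(sentence.to_string())
nlp.close()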
Example no. 10
class HIT_LTP():
    def __init__(self, MODELDIR):

        self.MODELDIR = MODELDIR

        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(MODELDIR, "cws.model"))

        # postags use the 863 POS tag set
        # https://ltp.readthedocs.io/zh_CN/latest/appendix.html#id3
        self.postagger = Postagger()
        self.postagger.load(os.path.join(MODELDIR, "pos.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(MODELDIR, "ner.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(MODELDIR, "parser.model"))

        self.srler = SementicRoleLabeller()
        self.srler.load(os.path.join(MODELDIR, "pisrl.model"))

    def ori_segment(self, sentence):

        words = self.segmentor.segment(sentence)
        words = list(words)
        return words

    def ori_pos(self, words):

        postags = self.postagger.postag(words)
        postags = list(postags)
        return postags

    def ori_ner(self, words, postags):

        netags = self.recognizer.recognize(words, postags)
        netags = list(netags)
        return netags

    def ori_parser(self, words, postags):

        arcs = self.parser.parse(words, postags)
        arcs = [[arc.head, arc.relation] for arc in arcs]
        return arcs

    # HIT LTP segments at the finest granularity by default
    def std_seg(self, sentence):

        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)

        terms = []
        offe = 0
        for word, postag in zip(words, postags):
            term = {}
            term['word'] = word
            term['nature'] = postag
            term['offset'] = offe
            offe += len(word)
            terms.append(term)

        return terms
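    # A hypothetical usage sketch for std_seg; "./ltp_data_v3.4.0" is a placeholder model
    # directory, and HIT_LTP loads all five models on construction:
    #   ltp = HIT_LTP("./ltp_data_v3.4.0")
    #   for term in ltp.std_seg('元芳你怎么看'):
    #       print(term['word'], term['nature'], term['offset'])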

    # segmentation with NER chunks merged in, i.e. coarse-grained segmentation
    def nlp_seg(self, sentence):

        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        netags = self.recognizer.recognize(words, postags)
        words = list(words)
        postags = list(postags)
        netags = list(
            netags
        )  # B-Ni E-Ni O O S-Ni O S-Nh O B-Ni E-Ni O S-Nh O O S-Nh O O O O S-Ns O S-Ns O

        chunks = self.get_ner_info(
            netags
        )  # [('Ni', 0, 2), ('Ni', 4, 5), ('Nh', 6, 7), ('Ni', 8, 10), ('Nh', 11, 12), ('Nh', 14, 15), ('Ns', 19, 20), ('Ns', 21, 22)]

        num_ners = len(chunks)
        # build words_ and postags_ with the NER chunks merged in
        words_ = []
        postags_ = []
        if num_ners != 0:
            ner_index = 0
            length = 0
            for i in range(len(words)):
                j = i + length
                if j < len(words):
                    ner_type = chunks[ner_index][0]
                    ner_start = chunks[ner_index][1]
                    ner_end = chunks[ner_index][2]
                    word = words[j]
                    postag = postags[j]
                    if j == ner_start:
                        for k in range(ner_start + 1, ner_end):
                            word += words[k]
                            length += 1
                        postag = ner_type.lower()
                        if ner_index < len(chunks) - 1:
                            ner_index += 1
                    words_.append(word)
                    postags_.append(postag)

        terms = []
        offe = 0
        for word, postag in zip(words_, postags_):
            term = {}
            term['word'] = word
            term['nature'] = postag
            term['offset'] = offe
            offe += len(word)
            terms.append(term)

        return terms

    def std_analysis(self, sentence):

        data = {}
        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        words = list(words)
        postags = list(postags)

        arcs = self.parser.parse(words, postags)
        arcs_ = [[arc.head, arc.relation] for arc in arcs]
        child_dict_list = self.build_parse_child_dict(words, postags, arcs)

        data['words'] = words
        data['postags'] = postags
        data['arcs'] = arcs_
        data['child_dict_list'] = child_dict_list

        return data

    def nlp_analysis(self, sentence):

        data = {}
        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        netags = self.recognizer.recognize(words, postags)
        words = list(words)
        postags = list(postags)
        netags = list(
            netags
        )  # B-Ni E-Ni O O S-Ni O S-Nh O B-Ni E-Ni O S-Nh O O S-Nh O O O O S-Ns O S-Ns O

        chunks = self.get_ner_info(
            netags
        )  # [('Ni', 0, 2), ('Ni', 4, 5), ('Nh', 6, 7), ('Ni', 8, 10), ('Nh', 11, 12), ('Nh', 14, 15), ('Ns', 19, 20), ('Ns', 21, 22)]

        num_ners = len(chunks)
        # build words_ and postags_ with the NER chunks merged in
        words_ = []
        postags_ = []
        if num_ners != 0:
            ner_index = 0
            length = 0
            for i in range(len(words)):
                j = i + length
                if j < len(words):
                    ner_type = chunks[ner_index][0]
                    ner_start = chunks[ner_index][1]
                    ner_end = chunks[ner_index][2]
                    word = words[j]
                    postag = postags[j]
                    if j == ner_start:
                        for k in range(ner_start + 1, ner_end):
                            word += words[k]
                            length += 1
                        postag = ner_type.lower()
                        if ner_index < len(chunks) - 1:
                            ner_index += 1
                    words_.append(word)
                    postags_.append(postag)

        arcs = self.parser.parse(words_, postags_)
        arcs_ = [[arc.head, arc.relation] for arc in arcs]
        child_dict_list = self.build_parse_child_dict(words_, postags_, arcs)

        data['words'] = words_
        data['postags'] = postags_
        data['arcs'] = arcs_
        data['child_dict_list'] = child_dict_list
        return data

    # NER based on the fine-grained words
    def ner(self, sentence):

        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        netags = self.recognizer.recognize(words, postags)
        words = list(words)
        postags = list(postags)
        netags = list(
            netags
        )  # B-Ni E-Ni O O S-Ni O S-Nh O B-Ni E-Ni O S-Nh O O S-Nh O O O O S-Ns O S-Ns O
        chunks = self.get_ner_info(
            netags
        )  # [('Ni', 0, 2), ('Ni', 4, 5), ('Nh', 6, 7), ('Ni', 8, 10), ('Nh', 11, 12), ('Nh', 14, 15), ('Ns', 19, 20), ('Ns', 21, 22)]

        ner_info = []
        for chunk in chunks:
            ner_type = chunk[0]
            ner_start = chunk[1]
            ner_end = chunk[2]
            ner_name = ''.join(words[ner_start:ner_end])
            ner_offe = 0
            for i in range(len(words)):
                if i == ner_start:
                    break
                ner_offe += len(words[i])
            ner_info.append({
                'ner_name': ner_name,
                'ner_type': ner_type,
                'ner_offe': ner_offe
            })
        return ner_info
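    # A hypothetical call to the fine-grained ner() helper above, reusing the ltp instance
    # from the previous sketch; each entry carries the entity text, its type (Nh / Ns / Ni)
    # and its character offset:
    #   for ent in ltp.ner('国家主席习近平抵达美国佛罗里达州'):
    #       print(ent['ner_name'], ent['ner_type'], ent['ner_offe'])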

    # note: this method name collides with the self.parser attribute created in __init__,
    # so on an instance the Parser object shadows it and the method is effectively unreachable
    def parser(self, sentence):

        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        arcs = self.parser.parse(words, postags)
        arcs = [[arc.head, arc.relation] for arc in arcs]

        return arcs

    # there may be more than one core word
    def get_core_words(self, sentence, words=None, postags=None):

        core_words_info = []
        core_words_indexs = []
        if words is None:
            words = self.segmentor.segment(sentence)
            words = list(words)
        if postags is None:
            postags = self.postagger.postag(words)
            postags = list(postags)
        arcs = self.parser.parse(words, postags)
        arcs_ = [[arc.head, arc.relation] for arc in arcs]
        child_dict_list = self.build_parse_child_dict(words, postags, arcs)
        for i in range(len(arcs_)):
            if arcs_[i][1] == 'HED':
                core_words_indexs.append(i)
                self.complete_core_words(core_words_indexs, i, child_dict_list)

        for i in core_words_indexs:
            word = words[i]
            offe = len(''.join(words[0:i]))
            temp_dic = {}
            temp_dic['word'] = word
            temp_dic['offe'] = offe
            core_words_info.append(temp_dic)

        return core_words_info
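    # A hypothetical call to get_core_words, again reusing the ltp instance;
    # build_parse_child_dict and complete_core_words are helper methods assumed to exist
    # elsewhere in the class. Each entry carries the HED word and its character offset:
    #   for core in ltp.get_core_words('习近平抵达美国佛罗里达州'):
    #       print(core['word'], core['offe'])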

    # for flexibility, words and postags can be supplied externally (decoupled)
    def get_srl_triple(self, sentence, words=None, postags=None):

        data = {}
        if words is None:
            words = self.segmentor.segment(sentence)
            words = list(words)
        if postags is None:
            postags = self.postagger.postag(words)
            postags = list(postags)
        netags = self.recognizer.recognize(words, postags)
        netags = list(netags)
        arcs = self.parser.parse(words, postags)
        arcs_ = [[arc.head, arc.relation] for arc in arcs]
        roles = self.srler.label(words, postags, arcs)

        # 可能有多组角色
        triple_info = []
        for role in roles:

            tem_dic = {}
            triple = ['', '', '']
            TMP = ''
            LOC = ''

            role = role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ])

            predicate = words[role[0]]

            triple[1] = predicate

            args = role[1].split(")")
            args.remove('')
            for ele in args:
                ele = ele.split(":")
                if ele[0] == "A0":
                    index = ele[1][1:].split(",")
                    A0 = words[int(index[0]):int(index[1]) + 1]
                    A0_str = "".join(A0)
                    triple[0] = A0_str
                if ele[0] == "A1":
                    index = ele[1][1:].split(",")
                    A1 = words[int(index[0]):int(index[1]) + 1]
                    A1_str = "".join(A1)
                    triple[2] = A1_str
                if ele[0] == "TMP":
                    index = ele[1][1:].split(",")
                    tmp = words[int(index[0]):int(index[1]) + 1]
                    tmp_str = "".join(tmp)
                    TMP = tmp_str
                if ele[0] == "LOC":
                    index = ele[1][1:].split(",")
                    loc = words[int(index[0]):int(index[1]) + 1]
                    loc_str = "".join(loc)
                    LOC = loc_str

            tem_dic['role'] = role
            tem_dic['predicate'] = predicate
            tem_dic['triple'] = triple
            tem_dic['TMP'] = TMP
            tem_dic['LOC'] = LOC

            triple_info.append(tem_dic)

        chunks = self.get_ner_info(
            netags
        )  # [('Ni', 0, 2), ('Ni', 4, 5), ('Nh', 6, 7), ('Ni', 8, 10), ('Nh', 11, 12), ('Nh', 14, 15), ('Ns', 19, 20), ('Ns', 21, 22)]
        ner_info = []
        for chunk in chunks:
            ner_type = chunk[0]
            ner_start = chunk[1]
            ner_end = chunk[2]
            ner_name = ''.join(words[ner_start:ner_end])
            ner_offe = 0
            for i in range(len(words)):
                if i == ner_start:
                    break
                ner_offe += len(words[i])
            ner_info.append({
                'ner_name': ner_name,
                'ner_type': ner_type,
                'ner_offe': ner_offe
            })
        data['words'] = words
        data['postags'] = postags
        data['arcs'] = arcs_
        data['triple_info'] = triple_info
        data['ner_info'] = ner_info

        return data
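
    # Illustrative sketch (assumed labeller output, not part of the original code):
    # if a role has predicate index 2 and its arguments serialise to
    # "A0:(0,1)A1:(3,5)", the parsing loop above yields
    #     triple = [''.join(words[0:2]), words[2], ''.join(words[3:6])]
    # i.e. A0 fills the subject slot and A1 the object slot, while TMP and LOC
    # arguments are returned separately alongside the triple.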

    def get_parser_triple(self, sentence, words=None, postags=None):

        data = {}
        if words is None:
            words = self.segmentor.segment(sentence)
            words = list(words)
        if postags is None:
            postags = self.postagger.postag(words)
            postags = list(postags)

        netags = self.recognizer.recognize(words, postags)
        netags = list(netags)
        arcs = self.parser.parse(words, postags)
        arcs_ = [[arc.head, arc.relation] for arc in arcs]
        child_dict_list = self.build_parse_child_dict(words, postags, arcs)

        triple_info = []
        for index in range(len(postags)):
            # extract fact triples centred on the predicate (verb)
            if postags[index] == 'v':
                child_dict = child_dict_list[index]

                # 主谓宾
                if 'SBV' in child_dict and 'VOB' in child_dict:
                    e1 = self.complete_e(words, postags, child_dict_list,
                                         child_dict['SBV'][0])
                    r = words[index]
                    e2 = self.complete_e(words, postags, child_dict_list,
                                         child_dict['VOB'][0])
                    temp_dic = {}
                    temp_dic['triple'] = [e1, r, e2]
                    temp_dic['type'] = '主谓宾'
                    triple_info.append(temp_dic)

                # Postposed attributive with a verb-object relation, e.g.
                # 进行(v) 正式 访问(VOB) 的 缅甸国务资政昂山素季(SBV)
                # verb + object are present, fill in the subject
                elif arcs[index].relation == 'ATT':
                    if 'VOB' in child_dict:
                        e1 = self.complete_e(words, postags, child_dict_list,
                                             arcs[index].head - 1)
                        r = words[index]
                        e2 = self.complete_e(words, postags, child_dict_list,
                                             child_dict['VOB'][0])
                        temp_string = r + e2
                        if temp_string == e1[:len(temp_string)]:
                            e1 = e1[len(temp_string):]
                        if temp_string not in e1:
                            temp_dic = {}
                            temp_dic['triple'] = [e1, r, e2]
                            temp_dic['type'] = '补主'
                            triple_info.append(temp_dic)

                # Subject-verb plus a verb-complement carrying a prepositional object, e.g.
                # 哈立德(SBV) 居住 在(CMP, verb-complement) 土耳其(POB)
                # subject + verb are present, fill in the object
                elif 'SBV' in child_dict and 'CMP' in child_dict:
                    #e1 = words[child_dict['SBV'][0]]
                    e1 = self.complete_e(words, postags, child_dict_list,
                                         child_dict['SBV'][0])
                    cmp_index = child_dict['CMP'][0]
                    r = words[index] + words[cmp_index]
                    if 'POB' in child_dict_list[cmp_index]:
                        e2 = self.complete_e(
                            words, postags, child_dict_list,
                            child_dict_list[cmp_index]['POB'][0])
                        temp_dic = {}
                        temp_dic['triple'] = [e1, r, e2]
                        temp_dic['type'] = '补宾'
                        triple_info.append(temp_dic)

                # 主谓
                elif 'SBV' in child_dict:
                    e1 = self.complete_e(words, postags, child_dict_list,
                                         child_dict['SBV'][0])
                    r = words[index]
                    temp_dic = {}
                    temp_dic['triple'] = [e1, r, '']
                    temp_dic['type'] = '主谓'
                    triple_info.append(temp_dic)

                # 谓宾
                elif 'VOB' in child_dict:
                    r = words[index]
                    e2 = self.complete_e(words, postags, child_dict_list,
                                         child_dict['VOB'][0])
                    temp_dic = {}
                    temp_dic['triple'] = ['', r, e2]
                    temp_dic['type'] = '谓宾'
                    triple_info.append(temp_dic)

                # FOB: fronted object, e.g. '中泰数字经济合作部级对话机制第一次会议在云南昆明召开'
                elif 'FOB' in child_dict:
                    r = words[index]
                    e2 = self.complete_e(words, postags, child_dict_list,
                                         child_dict['FOB'][0])
                    temp_dic = {}
                    temp_dic['triple'] = ['', r, e2]
                    temp_dic['type'] = '宾前'
                    triple_info.append(temp_dic)
        chunks = self.get_ner_info(
            netags
        )  # [('Ni', 0, 2), ('Ni', 4, 5), ('Nh', 6, 7), ('Ni', 8, 10), ('Nh', 11, 12), ('Nh', 14, 15), ('Ns', 19, 20), ('Ns', 21, 22)]
        ner_info = []
        for chunk in chunks:
            ner_type = chunk[0]
            ner_start = chunk[1]
            ner_end = chunk[2]
            ner_name = ''.join(words[ner_start:ner_end])
            ner_offe = 0
            for i in range(len(words)):
                if i == ner_start:
                    break
                ner_offe += len(words[i])
            ner_info.append({
                'ner_name': ner_name,
                'ner_type': ner_type,
                'ner_offe': ner_offe
            })

        core_words_info = self.get_core_words(sentence,
                                              words=words,
                                              postags=postags)

        triple_info, core_words_info_ = self.triple_info_futher_merge(
            core_words_info, triple_info)

        data['words'] = words
        data['postags'] = postags
        data['arcs'] = arcs_
        data['core_words_info'] = core_words_info_
        data['triple_info'] = triple_info
        data['ner_info'] = ner_info
        return data

    # When a triple's predicate appears in core_words_info (coordinated predicates, COO):
    # - if the predicates are adjacent and one triple lacks a subject while the other
    #   lacks an object, merge the two predicates together with that subject and object;
    # - if the predicates are not adjacent and some triples lack a subject or an object,
    #   fill in the missing subject / object from the other triples.
    def triple_info_futher_merge(self, core_words_info, triple_info):

        core_words_info_ = []
        for i in range(len(core_words_info) - 1):
            # the merged core_words_info has to be returned after merging
            # adjacent: the next core word starts exactly where this one ends
            if core_words_info[i]['offe'] + len(
                    core_words_info[i]['word']) == core_words_info[i + 1]['offe']:
                triple_ = ['', '', '']
                condition = 0
                #print('len(triple_info)',len(triple_info))
                for j in range(len(triple_info)):
                    #print('j', j)
                    #print('triple_info', triple_info)
                    if triple_info[j]['triple'][1] == core_words_info[i][
                            'word']:
                        if j + 1 < len(triple_info):
                            if triple_info[j +
                                           1]['triple'][1] == core_words_info[
                                               i + 1]['word']:
                                triple_[0] = triple_info[j]['triple'][0]
                                triple_[2] = triple_info[j + 1]['triple'][2]
                                triple_[1] = triple_info[j]['triple'][
                                    1] + triple_info[j + 1]['triple'][1]
                                triple_dic = {}
                                triple_dic['triple'] = triple_
                                triple_dic['type'] = '主谓宾'
                                triple_info[j] = triple_dic
                                condition = 1
                                break
                if condition == 1:
                    core_words_info_.append({
                        'word': triple_[1],
                        'offe': core_words_info[i]['offe']
                    })
                else:
                    core_words_info_.append(core_words_info[i])
            else:  # 不相邻
                sub = ''
                obj = ''
                for triple in triple_info:
                    if triple['triple'][0] != '':
                        sub = triple['triple'][0]
                    if triple['triple'][2] != '':
                        obj = triple['triple'][2]
                for triple in triple_info:
                    if triple['triple'][0] == '':
                        triple['triple'][0] = sub
                    if triple['triple'][2] == '':
                        triple['triple'][2] = obj
                core_words_info_.append(core_words_info[i])

        # append the last core word; the pairwise loop above never visits it
        if core_words_info:
            core_words_info_.append(core_words_info[-1])

        # print(core_words_info)
        # print(triple_info)
        return triple_info, core_words_info_
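
    # Illustrative sketch (assumed input, not part of the original code): given two
    # adjacent coordinated predicates '访问' and '会见' and the partial triples
    #     [{'triple': ['李克强', '访问', ''], 'type': '主谓'},
    #      {'triple': ['', '会见', '默克尔'], 'type': '谓宾'}]
    # the method above rewrites the first entry as
    #     {'triple': ['李克强', '访问会见', '默克尔'], 'type': '主谓宾'}
    # and reports the merged predicate '访问会见' as a single core word.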

    def restart(self):

        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.srler.release()

        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(self.MODELDIR, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(self.MODELDIR, "pos.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(self.MODELDIR, "ner.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(self.MODELDIR, "parser.model"))
        self.srler = SementicRoleLabeller()
        self.srler.load(os.path.join(self.MODELDIR, "pisrl.model"))

    def release(self):

        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.srler.release()

    # =================== utility / helper methods below ===================
    def get_ner_type(self, tag_name):

        tag_class = tag_name.split('-')[0]  # B I S E O
        tag_type = tag_name.split('-')[-1]  # Ni Ns Nh
        return tag_class, tag_type

    def get_ner_info(self, netags):

        default = "O"

        chunks = []

        # 定义 类别 和 起始索引
        chunk_type, chunk_start = None, None

        for i, tok in enumerate(netags):
            # End of a chunk 1
            if tok == default and chunk_type is not None:
                # Add a chunk.
                chunk = (chunk_type, chunk_start, i)
                chunks.append(chunk)

                chunk_type, chunk_start = None, None

            elif tok != default:
                tok_chunk_class, tok_chunk_type = self.get_ner_type(tok)
                # normalise the position tag so the comparisons below also work
                # with LTP's upper-case B/I/E/S tags
                tok_chunk_class = tok_chunk_class.lower()
                # continuation tags ('e' / 'm') never open or split a chunk
                if tok_chunk_class != 'e' and tok_chunk_class != 'm':
                    # start of a chunk
                    if chunk_type is None:
                        chunk_type, chunk_start = tok_chunk_type, i

                    # end of the previous chunk + start of a new one
                    elif tok_chunk_type != chunk_type or tok_chunk_class == "b" or tok_chunk_class == "s":
                        chunk = (chunk_type, chunk_start, i)
                        chunks.append(chunk)
                        chunk_type, chunk_start = tok_chunk_type, i
            else:
                pass

        # end condition
        if chunk_type is not None:
            chunk = (chunk_type, chunk_start, len(netags))
            chunks.append(chunk)

        return chunks

    def build_parse_child_dict(self, words, postags, arcs):
        """
        为句子中的每个词语维护一个保存句法依存儿子节点的字典
        Args:
            words: 分词列表
            postags: 词性列表
            arcs: 句法依存列表
        """
        child_dict_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)

            child_dict_list.append(child_dict)
        return child_dict_list
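
    # Illustrative sketch (assumed parse, not part of the original code): for
    # words = ['小明', '喜欢', '苹果'] with arcs whose (head, relation) pairs are
    # (2, 'SBV'), (0, 'HED'), (2, 'VOB'), build_parse_child_dict() returns
    #     [{}, {'SBV': [0], 'VOB': [2]}, {}]
    # i.e. the verb at index 1 has word 0 as its subject child and word 2 as its
    # object child; dict values are 0-based word indices, while arc.head is 1-based.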

    '''
    # 1、ATT定中关系,2、动宾短语实体,3、从父节点向子节点遍历
    def complete_e(self, words, postags, child_dict_list, word_index):
        """
        完善识别的部分实体
        """
        child_dict = child_dict_list[word_index]
        prefix = ''
        if 'ATT' in child_dict:
            for i in range(len(child_dict['ATT'])):
                prefix += self.complete_e(words, postags, child_dict_list, child_dict['ATT'][i])
        
        postfix = ''
        if postags[word_index] == 'v':
            if 'VOB' in child_dict:
                postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
            if 'SBV' in child_dict:
                prefix = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix
    
        return prefix + words[word_index] + postfix
    '''
    '''
    def complete_e(self, words, postags, child_dict_list, word_index):
        """
        完善识别的部分实体
        """
        child_dict = child_dict_list[word_index]
        prefix = ''
        postfix = ''
        if 'ATT' in child_dict:
            for i in range(len(child_dict['ATT'])):
                prefix += self.complete_e(words, postags, child_dict_list, child_dict['ATT'][i])
        if 'COO' in child_dict:
            for i in range(len(child_dict['COO'])):
                postfix += self.complete_e(words, postags, child_dict_list, child_dict['COO'][i])
    
        if postags[word_index] == 'v':
            if 'VOB' in child_dict:
                postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
            if 'SBV' in child_dict:
                prefix = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix
    
        return prefix + words[word_index] + postfix
    '''

    def complete_e(self, words, postags, child_dict_list, word_index):
        """
        完善识别的部分实体
        """
        child_dict = child_dict_list[word_index]
        prefix = ''
        postfix = ''
        if 'ATT' in child_dict:
            for i in range(len(child_dict['ATT'])):
                prefix += self.complete_e(words, postags, child_dict_list,
                                          child_dict['ATT'][i])
        if 'COO' in child_dict:
            for i in range(len(child_dict['COO'])):
                # concatenate directly when the coordinated item immediately
                # follows this word; otherwise join the pieces with '、'
                if child_dict['COO'][i] - word_index == 1:
                    # if postags[child_dict['COO'][i]] == 'j':  # filtering by POS was considered, but may be too narrow
                    postfix += self.complete_e(words, postags, child_dict_list,
                                               child_dict['COO'][i])
                else:
                    postfix += '、' + self.complete_e(
                        words, postags, child_dict_list, child_dict['COO'][i])

        if postags[word_index] == 'v':
            if 'VOB' in child_dict:
                postfix += self.complete_e(words, postags, child_dict_list,
                                           child_dict['VOB'][0])
            if 'SBV' in child_dict:
                prefix = self.complete_e(words, postags, child_dict_list,
                                         child_dict['SBV'][0]) + prefix

        return prefix + words[word_index] + postfix
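
    # Illustrative sketch (assumed parse, not part of the original code): for
    # words = ['中国', '国家', '主席'] where both '中国' and '国家' depend on
    # '主席' via the ATT relation, complete_e(words, postags, child_dict_list, 2)
    # recursively prepends the attributes and returns '中国国家主席'; for a verb
    # it additionally pulls in its SBV child as a prefix and its VOB child as a
    # suffix.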

    # expand the HED core word with its coordinated (COO) predicates, recursively
    def complete_core_words(self, core_words_indexs, hed_index,
                            child_dict_list):

        if 'COO' in child_dict_list[hed_index].keys():
            core_words_indexs += child_dict_list[hed_index]['COO']
            for i in child_dict_list[hed_index]['COO']:
                self.complete_core_words(core_words_indexs, i, child_dict_list)
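
# Hypothetical usage sketch (the enclosing class is defined above this excerpt;
# "ltp" below stands for an instance of it, whatever its actual name is):
#     ltp = <EnclosingClass>(LTP_DATA_DIR)
#     result = ltp.get_parser_triple('...某个中文句子...')
#     result['triple_info']      # dependency-based (subject, predicate, object) triples
#     result['ner_info']         # named entities with character offsets
#     result['core_words_info']  # HED core predicates, merged across COO coordination
#     ltp.release()              # free the underlying pyltp models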
Esempio n. 11
0
if __name__ == "__main__":
    args = parse_args()
    LTP_DATA_DIR = '../data/ltp_data'

    pos_model_path = os.path.join(LTP_DATA_DIR,
                                  'pos.model')  # 词性标注模型路径,模型名称为`pos.model`
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  #分词
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  #语法分析

    segmentor = Segmentor()
    segmentor.load(cws_model_path)

    postagger = Postagger()  # 初始化实例
    postagger.load(pos_model_path)  # 加载模型

    parser = Parser()
    parser.load(par_model_path)

    entity_name = "母婴"
    #sentence =[ "专门打电话来问我要不要买手机","最近想买部手机","我想入手一部索尼的手机,主要用于日常拍摄和毕业旅行"]

    #mode_list=['val','test','train']
    trigger_words = [
        '小孩', '孩子', '宝宝', '妈妈', '月子', '奶瓶', '幼儿', '新生儿', '儿童', '儿子', '女儿',
        '婴儿', '胎儿', '爸爸', '母爱', '麻麻', '避孕', '妊娠', '孕期', '孕妇', '母乳', '疫苗', '辣妈',
        '妈', '当妈', '怀孕', '宝妈', '母婴', '孕妈', '奶爸', '宝贝', '辅食', '奶粉', '男孩', '女孩',
        '男宝', '女宝', '女宝', '湿疹', '父母', '母亲', '父亲', '元素', '微量元素', '臭臭', '哺乳',
        '米粉', '父教', '产妇', '堕胎', '纸尿裤', '尿裤', '娃', '小儿', '尿不湿', '回奶', '断奶',
        '早教', '胎教', '吐奶', '待产', '宝', '童车', '孕前', '孕', '奶嘴', '早产', '冲奶', '育儿',
        '月嫂', '叶酸', '二胎', '吸乳', '乳汁', '产前', '产后', '奶水', '亲子装'
    ]
Esempio n. 12
0
    +---------------+-----+----------------------------+--------------------------------+
    | Core relation | HED | head                       | the core of the whole sentence |
    +---------------+-----+----------------------------+--------------------------------+

"""

from __future__ import unicode_literals
import logging
from pyltp import Parser
from . import ltp_model_loader
from .ltp_cloud import dp_online

__all__ = ['dependency_parsing']

# 加载模型
parser = Parser()
ltp_model_loader.load(parser)


def dependency_parsing(words, postags, online=False):
    """ 句法分析
    :param words: 分词结果
    :param postags: 词性标注结果
    :return: 句法分析树
    """
    # online=True, 使用 ltp-cloud 做句法分析
    if online:
        return dp_online(words, postags)

    # 使用本地 ltp 做句法分析
    arcs = parser.parse([i.encode('utf-8') for i in words],
Esempio n. 13
0
class LtpAnalysis(object):
    def __init__(self):
        self.postagger = Postagger()
        self.parser = Parser()
        self.parser.load(par_model_path)
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model_path)
        self.labeller = SementicRoleLabeller()
        self.labeller.load(srl_model_path)
        self.postagger.load_with_lexicon(pos_model_path,
                                         '/home/wangwei/conf/posttags.txt')

    def LtpRecon(self, sents):
        """
        分词,词性,句法,命名实体识别,语义识别
        :param sents:
        :return:
        """
        #分词
        words = [i.encode('utf-8', 'ignore') for i in norm_cut(sents)]
        logger.info('\t'.join(words))
        #词性
        postags = self.postagger.postag(words)
        logger.info('\t'.join(postags))
        #句法
        arcs = self.parser.parse(words, postags)
        logger.info("\t".join("%d:%s" % (arc.head, arc.relation)
                              for arc in arcs))
        #实体识别
        netags = self.recognizer.recognize(words, postags)
        logger.info('\t'.join(netags))
        #语义标注
        roles = self.labeller.label(words, postags, arcs)
        for role in roles:
            print role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ])

        self.words, self.postags, self.arcs, self.netags, self.roles = \
            words, postags, arcs, netags, roles

    def deal_arc(self):
        drelation = {}  # "child_index#head_index" -> relation
        num = -1
        for arc in self.arcs:
            num += 1
            k = str(num) + '#' + str(arc.head - 1)
            drelation[k] = arc.relation
        return drelation

    def vob(self, index):
        num = -1
        for arc in self.arcs:
            num += 1
            if arc.relation in ['VOB'] and (arc.head - 1) == index:
                return self.words[num]

    def att(self, att):
        num = -1

    def post(self, target):
        """
         评价对象的扩展 ,解决ATT
         :param num:
         :return:
         """
        obj = set()
        obj.add(target)
        num = 0
        for arc in self.arcs:
            if (arc.head - 1) == target and arc.relation == 'ATT':
                obj.add(arc.head - 1)
                obj |= self.post(num)
            num += 1
        return obj

    def analysis(self, sents):
        self.LtpRecon(sents)
        # self.deal_arc()
        num = -1
        for arc in self.arcs:
            num += 1
            if arc.relation == 'SBV':
                vob_word = self.vob(arc.head - 1)
                att = self.post(num)
                attword = ''.join([self.words[i] for i in att if i != num])
                print attword, self.words[num], self.words[arc.head -
                                                           1], vob_word
Esempio n. 14
0
LTP_DATA_DIR = './ltp_data_v3.4.0'  # ltp模型目录的路径
par_model_path = os.path.join(LTP_DATA_DIR,
                              'parser.model')  # 依存句法分析模型路径,模型名称为`parser.model`
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # 分词
pos_model_path = os.path.join(LTP_DATA_DIR,
                              'pos.model')  # 词性标注模型路径,模型名称为`pos.model`
srl_model_path = os.path.join(LTP_DATA_DIR, 'pisrl.model')  # 语义角色标注模型目录路径

segmentor = Segmentor()  # 初始化实例
segmentor.load(cws_model_path)  # 加载模型

postagger = Postagger()  # 初始化实例
postagger.load(pos_model_path)  # 加载模型

parser = Parser()  # 初始化实例
parser.load(par_model_path)  # 加载模型

labeller = SementicRoleLabeller()  # 初始化实例
labeller.load(srl_model_path)  # 加载模型


def cht_to_chs(line):  # convert Traditional Chinese to Simplified Chinese
    line = Converter('zh-hans').convert(line)
    return line


def load_data():  # 从手动标注集加载数据
    words = []
    labA = []
Esempio n. 15
0
class SentenceParser:
    """
    A class for sentence analysis
    """
    def __init__(self):
        """
        Load remote lexicon and ltp model
        """
        self.temp_lexicon = "temp_lexicon"
        self.fetch_lexicon()

        self.sentence_splitter = SentenceSplitter()
        self.segment = Segmentor()
        self.segment.load_with_lexicon(CWS_MODEL, self.temp_lexicon)
        self.pos = Postagger()
        self.pos.load_with_lexicon(POS_MODEL, self.temp_lexicon)
        self.tree_parser = Parser()
        self.tree_parser.load(PARSER_MODEL)

    def fetch_lexicon(self):
        """
        Load lexicon and write to local
        """
        res = db.fetch_lexicon()
        with open(self.temp_lexicon, "w", encoding="utf8") as f:
            for item in res:
                token, synonym, norm_token, pos = item
                pos = pos.replace(",", " ")
                token = "%s %s" % (token, pos)
                norm_token = "%s %s" % (norm_token, pos)
                if synonym:
                    synonym = "\n".join(
                        list(
                            map(lambda x: "%s %s" % (x, pos),
                                synonym.split(","))))
                    f.write("%s\n%s\n%s\n" % (token, synonym, norm_token))
                else:
                    f.write("%s\n%s\n" % (token, norm_token))

    def seg_sentence(self, text):
        """
        Segment sentence by punctuation
        :param text: raw string
        :return: vector of sentences, use list() to convert to [sentence0, sentence1, ...]
        """
        return self.sentence_splitter.split(text)

    def seg_token(self, text):
        """
        Segment token by model and lexicon
        :param text: raw string
        :return: vector of tokens, use list() to convert to [token0, token1, ...]
        """
        return self.segment.segment(text)

    def pos_tag(self, text):
        """
        Tag part of speech for text by model and lexicon
        :param text: raw string
        :return: vector of pos tags, use list() to convert to [pos0, pos1, ...]
        """
        tokens = self.seg_token(text)
        return self.pos.postag(tokens)

    def parse_list(self, text):
        """
        Parse the sentence as a list of word node
        :param text: raw string
        :return: a list of word node
        """
        result = []
        words = self.seg_token(text)
        pos_list = self.pos.postag(words)
        if len(words) == 0 or len(pos_list) == 0:
            return result
        arcs = self.tree_parser.parse(words, pos_list)

        nodes = list(map(lambda x: (x.head, x.relation), arcs))
        for token, pos, relation in zip(words, pos_list, nodes):
            word_node = WordNode(token, pos, relation[1])
            result.append(word_node)
        return result

    def parse_tree(self, text):
        """
        Parse the sentence as a dependence tree of word node
        :param text: raw string
        :return: a dependence tree of word node
        """
        words = self.seg_token(text)
        pos = self.pos.postag(words)
        if len(words) == 0 or len(pos) == 0:
            return WordNode("", "", "", None)
        arcs = self.tree_parser.parse(words, pos)
        nodes = list(map(lambda x: (x.head, x.relation), arcs))

        root_idx = find_x(nodes, 0)
        root = WordNode(words[root_idx[0]], pos[root_idx[0]],
                        nodes[root_idx[0]][1])
        tree = {root_idx[0]: root}
        queue = root_idx

        while len(queue):
            next_idx = queue.pop()
            for idx in find_x(nodes, next_idx + 1):
                queue.insert(0, idx)
                new_node = WordNode(words[idx], pos[idx], nodes[idx][1])
                tree[next_idx].next.append(new_node)
                tree[idx] = new_node
        return root
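
# find_x() is used by parse_tree() above but is not shown in this snippet.
# A minimal sketch of what it plausibly does, inferred from its call sites
# (return the indices of all nodes whose head equals the given value; a head
# of 0 marks the root of the dependency tree):
def find_x(nodes, head):
    return [i for i, node in enumerate(nodes) if node[0] == head]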
Esempio n. 16
0
class SentenceParser:
    """
    A class for sentence analysis
    """
    def __init__(self):
        """
        Load remote lexicon and ltp model
        """
        self.temp_lexicon = "temp_lexicon"
        self.fetch_lexicon()

        self.sentence_splitter = SentenceSplitter()
        self.segment = Segmentor()
        self.segment.load_with_lexicon(CWS_MODEL, self.temp_lexicon)
        self.pos = Postagger()
        self.pos.load_with_lexicon(POS_MODEL, self.temp_lexicon)
        self.tree_parser = Parser()
        self.tree_parser.load(PARSER_MODEL)

        self.rules = IterDocument("data/rule")

    def fetch_lexicon(self):
        """
        Load lexicon and write to local
        """
        res = db.fetch_lexicon()
        with open(self.temp_lexicon, "w", encoding="utf8") as f:
            for item in res:
                token, synonym, norm_token, pos = item
                pos = pos.replace(",", " ")
                token = "%s %s" % (token, pos)
                norm_token = "%s %s" % (norm_token, pos)
                if synonym:
                    synonym = "\n".join(
                        list(
                            map(lambda x: "%s %s" % (x, pos),
                                synonym.split(","))))
                    f.write("%s\n%s\n%s\n" % (token, synonym, norm_token))
                else:
                    f.write("%s\n%s\n" % (token, norm_token))

    def seg_sentence(self, text):
        """
        Segment sentence by punctuation
        :param text: raw string
        :return: vector of sentences, use list() to convert to [sentence0, sentence1, ...]
        """
        return self.sentence_splitter.split(text)

    def seg_token(self, text):
        """
        Segment token by model and lexicon
        :param text: raw string
        :return: vector of tokens, use list() to convert to [token0, token1, ...]
        """
        return self.segment.segment(text)

    def pos_tag(self, text):
        """
        Tag part of speech for text by model and lexicon
        :param text: raw string
        :return: vector of pos, use list() to convert as [pos0, pos1, ...]
        """
        tokens = self.seg_token(text)
        return self.pos.postag(tokens)

    def parse_list(self, text, need_info=False):
        """
        Parse the sentence as a list of word node
        :param need_info: whether need extra info
        :param text: raw string
        :return: a list of word node
        """
        result = []
        words = self.seg_token(text)
        pos_list = self.pos.postag(words)
        if len(words) == 0 or len(pos_list) == 0:
            return result
        arcs = self.tree_parser.parse(words, pos_list)

        nodes = list(map(lambda x: (x.head, x.relation), arcs))
        for token, pos, relation in zip(words, pos_list, nodes):
            if need_info:
                info = db.get_word(token)
                if info:
                    category, norm_token, extra = info
                    word_node = WordNode(token, pos, relation[1], category,
                                         norm_token, extra)
                else:
                    word_node = WordNode(token, pos, relation[1])
            else:
                word_node = WordNode(token, pos, relation[1])
            result.append(word_node)
        return result

    def parse_tree(self, text, need_info=False):
        """
        Parse the sentence as a dependence tree of word node
        :param need_info: whether need extra info
        :param text: raw string
        :return: a dependence tree of word node
        """
        words = self.seg_token(text)
        pos = self.pos.postag(words)
        if len(words) == 0 or len(pos) == 0:
            return WordNode("", "", "", None)
        arcs = self.tree_parser.parse(words, pos)
        nodes = list(map(lambda x: (x.head, x.relation), arcs))

        root_idx = find_x(nodes, 0)
        if need_info:
            info = db.get_word(words[root_idx[0]])
            if info:
                category, norm_token, extra = info
                root = WordNode(words[root_idx[0]], pos[root_idx[0]],
                                nodes[root_idx[0]][1], category, norm_token,
                                extra)
            else:
                root = WordNode(words[root_idx[0]], pos[root_idx[0]],
                                nodes[root_idx[0]][1])
        else:
            root = WordNode(words[root_idx[0]], pos[root_idx[0]],
                            nodes[root_idx[0]][1])
        tree = {root_idx[0]: root}
        queue = root_idx

        while len(queue):
            next_idx = queue.pop()
            for idx in find_x(nodes, next_idx + 1):
                queue.insert(0, idx)
                if need_info:
                    info = db.get_word(words[idx])
                    if info:
                        category, norm_token, extra = info
                        new_node = WordNode(words[idx], pos[idx],
                                            nodes[idx][1], category,
                                            norm_token, extra)
                    else:
                        new_node = WordNode(words[idx], pos[idx],
                                            nodes[idx][1])
                else:
                    new_node = WordNode(words[idx], pos[idx], nodes[idx][1])
                tree[next_idx].next.append(new_node)
                tree[idx] = new_node
        return root

    def extract(self, path):
        res = []
        for rule in self.rules:
            window_size = len(rule.split(";"))
            if len(path) == window_size:
                if ";".join(map(lambda x: "%s,%s" % (x.relation, x.pos),
                                path)) == rule:
                    res.append(" ".join(map(lambda x: x.token, path)))
            else:
                for i in range(len(path) - window_size):
                    p_slice = ";".join(
                        map(lambda x: "%s,%s" % (x.relation, x.pos),
                            path[i:i + window_size]))
                    if p_slice == rule:
                        res.append(" ".join(
                            map(lambda x: x.token, path[i:i + window_size])))
                        break
        return res
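
    # Illustrative note (the rule file data/rule is not shown here): judging from
    # extract(), each rule line is a ';'-joined window of "relation,pos" pairs,
    # e.g. a hypothetical rule
    #     SBV,n;HED,v;VOB,n
    # would match any stretch of the parsed path whose nodes carry exactly those
    # (relation, pos) combinations, and the matching tokens are collected joined
    # by spaces.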
Esempio n. 17
0
######### LTP分词准备 ##########
import os
import re
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer

LTP_DIR = ".../ltp_data"
# 分词
# B词首,I词中,E词尾,S单字成词
segmentor = Segmentor()
segmentor.load(os.path.join(LTP_DIR, "cws.model"))
# 词性标注
postagger = Postagger()
postagger.load(os.path.join(LTP_DIR, "pos.model"))
# 依存句法分析
parser = Parser()
parser.load(os.path.join(LTP_DIR, "parser.model"))
# 命名实体识别
# O这个词不是NE, S这个词单独构成一个NE,B这个词为一个NE的开始,I这个词为一个NE的中间,E这个词为一个NE的结尾
# Nh人名,Ni机构名,Ns地名
recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(LTP_DIR, "ner.model"))


######### 长句切分 ##########
# 输入内容
# 输出短句列表
# 将一个长句子去除干扰字符后按指定标点切分
def seg_long_sents(content):
    # 去除空格' ', 中文全角空格 '\u3000', 中文破折号'——'
    # 之后根据‘?’ ‘!’ '?' '!' '。' 换行 回车 切分长句
class NLP:
    """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析
    Attributes:
        default_user_dict_dir: str,用户自定义词典目录
        default_model_dir: str,ltp模型文件目录
    """
    default_user_dict_dir = '../../resource/'  # 默认的用户词典目录,清华大学法律词典
    default_model_dir = '../../model/'  # ltp模型文件目录

    def __init__(self,
                 user_dict_dir=default_user_dict_dir,
                 model_dir=default_model_dir):
        self.default_user_dict_dir = user_dict_dir
        self.default_model_dir = model_dir
        # 初始化分词器
        # pynlpir.open()  # 初始化分词器
        # 添加用户词典(法律文书大辞典与清华大学法律词典),这种方式是添加进内存中,速度更快
        files = os.listdir(user_dict_dir)
        for file in files:
            file_path = os.path.join(user_dict_dir, file)
            # 文件夹则跳过
            if os.path.isdir(file):
                continue
            with open(file_path, 'r', encoding='utf-8') as f:
                line = f.readline()
                while line:
                    word = line.strip('\n').strip()
                    jieba.add_word(word)
                    # print(c_char_p(word.encode()))
                    # pynlpir.nlpir.AddUserWord(c_char_p(word.encode()))
                    line = f.readline()

        # 加载ltp模型
        # 词性标注模型
        self.postagger = Postagger()
        postag_flag = self.postagger.load(
            os.path.join(self.default_model_dir, 'pos.model'))
        # 命名实体识别模型
        self.recognizer = NamedEntityRecognizer()
        ner_flag = self.recognizer.load(
            os.path.join(self.default_model_dir, 'ner.model'))
        # 依存句法分析模型
        self.parser = Parser()
        parse_flag = self.parser.load(
            os.path.join(self.default_model_dir, 'parser.model'))

        if postag_flag or ner_flag or parse_flag:
            print('load model failed!')

    def segment(self, sentence, entity_postag=dict()):
        """采用NLPIR进行分词处理
        Args:
            sentence: string,句子
            entity_postag: dict,实体词性词典,默认为空集合,分析每一个案例的结构化文本时产生
        Returns:
            lemmas: list,分词结果
        """
        # 添加实体词典
        if entity_postag:
            for entity in entity_postag:
                # pynlpir.nlpir.AddUserWord(c_char_p(entity.encode()))
                jieba.add_word(entity)
        # pynlpir.nlpir.AddUserWord(c_char_p('前任'.encode()))  # 单个用户词加入示例
        # pynlpir.nlpir.AddUserWord(c_char_p('习近平'.encode()))  # 单个用户词加入示例
        # 分词,不进行词性标注
        # lemmas = pynlpir.segment(sentence, pos_tagging=False)
        lemmas = jieba.lcut(sentence)
        # pynlpir.close()  # 释放
        return lemmas

    def postag(self, lemmas):
        """对分词后的结果进行词性标注
        Args:
            lemmas: list,分词后的结果
            entity_dict: set,实体词典,处理具体的一则判决书的结构化文本时产生
        Returns:
            words: WordUnit list,包含分词与词性标注结果
        """
        words = []  # 存储句子处理后的词单元
        # 词性标注
        postags = self.postagger.postag(lemmas)
        for i in range(len(lemmas)):
            # 存储分词与词性标记后的词单元WordUnit,编号从1开始
            word = WordUnit(i + 1, lemmas[i], postags[i])
            words.append(word)
        # self.postagger.release()  # 释放
        return words

    def get_postag(self, word):
        """获得单个词的词性标注
        Args:
            word: str,单词
        Returns:
            post_tag: str,该单词的词性标注
        """
        post_tag = self.postagger.postag([
            word,
        ])
        return post_tag[0]

    def netag(self, words):
        """命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并
        Args:
            words: WordUnit list,包含分词与词性标注结果
        Returns:
            words_netag: WordUnit list,包含分词,词性标注与命名实体识别结果
        """
        lemmas = []  # 存储分词后的结果
        postags = []  # 存储词性标注结果
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # 命名实体识别
        netags = self.recognizer.recognize(lemmas, postags)
        # print('\t'.join(netags))  # just for test
        words_netag = EntityCombine().combine(words, netags)
        # self.recognizer.release()  # 释放
        return words_netag

    def parse(self, words):
        """对分词,词性标注与命名实体识别后的结果进行依存句法分析(命名实体识别可选)
        Args:
            words_netag: WordUnit list,包含分词,词性标注与命名实体识别结果
        Returns:
            *: SentenceUnit,该句子单元
        """
        lemmas = []  # 分词结果
        postags = []  # 词性标注结果
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # 依存句法分析
        arcs = self.parser.parse(lemmas, postags)
        for i in range(len(arcs)):
            words[i].head = arcs[i].head
            words[i].dependency = arcs[i].relation
        # self.parser.release()
        return SentenceUnit(words)

    def close(self):
        """关闭与释放nlp"""
        # pynlpir.close()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
Esempio n. 19
0
class Parser:
    def __init__(self):
        os.environ['STANFORD_PARSER'] = STANFORD_PARSER_PATH
        os.environ['STANFORD_MODELS'] = STANFORD_MODELS_PATH
        os.environ['JAVAHOME'] = JAVA_HOME
        stanford_model_path = CHINESE_MODEL_PATH
        self.s_parser = stanford.StanfordParser(model_path=stanford_model_path)

        par_model_path = os.path.join(
            LTP_DATA_DIR, 'parser.model')  # 依存句法分析模型路径,模型名称为`parser.model`

        from pyltp import Parser
        self.parser = Parser()  # 初始化实例
        self.parser.load(par_model_path)  # 加载模型

        cws_model_path = os.path.join(LTP_DATA_DIR,
                                      'cws.model')  # 分词模型路径,模型名称为`cws.model`

        from pyltp import Segmentor
        self.segmentor = Segmentor()  # 初始化实例
        self.segmentor.load(cws_model_path)  # 加载模型

        pos_model_path = os.path.join(LTP_DATA_DIR,
                                      'pos.model')  # 词性标注模型路径,模型名称为`pos.model`

        from pyltp import Postagger
        self.postagger = Postagger()  # 初始化实例
        self.postagger.load(pos_model_path)  # 加载模型

        ner_model_path = os.path.join(
            LTP_DATA_DIR, 'ner.model')  # 命名实体识别模型路径,模型名称为`pos.model`

        from pyltp import NamedEntityRecognizer
        self.recognizer = NamedEntityRecognizer()  # 初始化实例
        self.recognizer.load(ner_model_path)  # 加载模型

        q_words = {
            'q1_person': ['谁', '那个', '哪个'],
            'q1_time': ['那年', '时间', '哪年', '何时', '多久', '时候', '年'],
            'q1_amount': ['多', '几', '多少', '第几'],
            'q1_place': ['哪儿', '哪家', '哪里人', '哪里', '那家', '那里人', '那里'],
            'q1_result': ['怎么', '为什么', '为何', '如何', '何'],
            'q1_judge': ['是否', '还是', '吗'],
            'q0_other': ['哪些', '那些', '干什么'],
            'q0_definition': ['什么样', '什么', '怎么样', '怎样'],
        }
        self.question_words = []
        self.word2key = {}

        for k, v in q_words.items():
            self.question_words += v
            for _v in v:
                self.word2key[_v] = k

        self.stop_words = set()
        with open('../data/all-stop-word.txt') as f_stop:
            for i in f_stop.readlines():
                self.stop_words.add(i.strip())
        self.articles = []

    def cut_sentence(self, sent, stop=False):
        """
        句子分词
        :param sent: 
        :param stop: 
        :return: 
        """
        if stop:
            words = list(
                filter(lambda x: x not in self.stop_words,
                       list(self.segmentor.segment(sent.strip()))))
        else:
            words = list(self.segmentor.segment(sent.strip()))
        return words

    def get_question_type(self, question):
        """
        获取问题类型
        :param question: 
        :return: 
        """
        q_type = ''
        words = self.cut_sentence(question)
        flag = False
        for w in self.question_words:
            if w in words:
                flag = True
                q_type = self.word2key[w]
                break
        if not flag:
            # print(i, words)
            q_type = 'other'
        print(q_type)

    def word_count(self, sentences):
        """
        篇章中的词频统计
        :param sentences: 句子列表
        :return: 
        """
        all_words = []
        for i in sentences:
            all_words += self.cut_sentence(i, True)
        word_count = {}
        for i in all_words:
            if i in word_count:
                word_count[i] += 1
            else:
                word_count[i] = 1
        return word_count, sum(word_count.values())

    def read_train_set(self, file_path):
        """
        读取测试文件
        :param file_path: 文件路径
        :return: 
        """
        with open(file_path) as f_in:
            last_q = ''
            article = {'question': '', 'result': '', 'sentences': []}
            for i in f_in.readlines():
                line = i.strip().split('\t')
                if last_q == line[1]:
                    article['sentences'].append(line[2])
                    if int(line[0]) == 1:
                        article['result'] = line[2]
                else:
                    self.articles.append(article)
                    article = {
                        'question': line[1],
                        'result': '',
                        'sentences': []
                    }
                last_q = line[1]
            self.articles.append(article)
        self.articles = self.articles[1:]
        print(len(self.articles))
        print(self.articles[0])

    def tf_idf(self):
        with open('../data/question_word.txt') as f_in:
            pass

    def analysis_question(self, index, debug=True):
        if len(self.articles) <= 0:
            return
        article = self.articles[index]
        q_words = self.cut_sentence(article['question'], True)
        true_result = ''.join(self.cut_sentence(article['result'], True))
        if debug:
            print('q', self.cut_sentence(article['question'], True))
            print('q', article['question'])

            print('a', self.cut_sentence(article['result'], True))
            print('a', true_result)
        # print(q_words)
        # 候选答案句切词
        l_words = [
            self.cut_sentence(line, True) for line in article['sentences']
        ]
        # 计算关键词idf
        idf = {}
        for word in q_words:
            count = 0
            for line in l_words:
                if word in line:
                    count += 1
            idf[word] = count
        idf = {
            k:
            math.log(len(l_words) * 1.0 / (v + 1)) if len(l_words) > 0 else 0
            for k, v in idf.items()
        }
        # print(idf)

        line2score = {}
        for line in l_words:
            score = 0
            for word in q_words:
                # 计算关键词tf
                tf = 0
                delta = 1
                if len(re.findall(r'\d+', word)) > 0:
                    delta = 3
                for i in line:
                    if i == word:
                        tf += 1
                if len(line) == 0:
                    tf = 0
                else:
                    tf = (tf * 1.0 * delta) / len(line)
                score += tf * idf[word]
            line2score[''.join(line)] = score
        res = sorted(line2score.items(), key=lambda x: x[1], reverse=True)
        if debug:
            for i in res:
                print(i[1], i[0])
        if len(res) > 0:
            for i in range(len(res)):
                if res[i][0] == true_result:
                    return i + 1
            return 0
        else:
            return 0
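
    # Note restating the scoring performed above: each candidate sentence gets
    #     score = sum over question keywords of tf * idf
    # with tf  = delta * count(keyword in sentence) / len(sentence)
    #      idf = log(N / (df + 1)), N = number of candidate sentences,
    #      df  = number of candidates containing the keyword,
    # and delta = 3 for keywords containing digits, 1 otherwise; candidates are
    # then ranked by score in descending order and compared with the gold answer.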
Esempio n. 20
0
class PreTrain(object):
    relationsMapping = {
        'other': 0,
        'locaA': 1,
        'locAa': 2,
        'med-ill': 3,
        'ill-med': 4,
        "clsaA": 5,
        "clsAa": 6,
        "w-c": 7,
        "c-w": 8,
        "cs-ef": 9,
        "ef-cs": 10
    }
    distanceMapping = {'PADDING': 0, 'LowerMin': 1, 'GreaterMax': 2}
    minDistance = -100
    maxDistance = 100

    maxSentenceLen = 100
    max_distance = 204

    parser = Parser()  # 初始化实例

    def __init__(self, w2vmodel_path):
        self.parser.load("../LTP/parser.model")  # 加载模型
        self.model = models.Word2Vec.load(w2vmodel_path)
        self.model_vocab = self.model.wv.vocab
        self.model_embedding = self.model.wv.get_keras_embedding(False)
        for dis in range(self.minDistance, self.maxDistance + 1):
            self.distanceMapping[dis] = len(self.distanceMapping)
        print(len(self.distanceMapping))

    def load_w2vEmb(self):
        return self.model

    def sentence_w2v(self, pos1, pos2, sentence):
        pos1 = int(pos1)
        pos2 = int(pos2)
        sdp = np.zeros(self.maxSentenceLen, dtype=np.float32)
        tokenidxs = np.zeros(self.maxSentenceLen)
        positionValues1 = np.zeros(self.maxSentenceLen)
        positionValues2 = np.zeros(self.maxSentenceLen)
        tokens = str(sentence).split(" ")
        words = tokens.copy()
        flags = []
        slen = len(tokens)
        for idx in range(0, slen):
            sdp[idx] = 0.3
            distance1 = idx - int(pos1)
            distance2 = idx - int(pos2)
            if distance1 in self.distanceMapping:
                positionValues1[idx] = self.distanceMapping[distance1]
            elif distance1 <= self.minDistance:
                positionValues1[idx] = self.distanceMapping['LowerMin']
            else:
                positionValues1[idx] = self.distanceMapping['GreaterMax']

            if distance2 in self.distanceMapping:
                positionValues2[idx] = self.distanceMapping[distance2]
            elif distance2 <= self.minDistance:
                positionValues2[idx] = self.distanceMapping['LowerMin']
            else:
                positionValues2[idx] = self.distanceMapping['GreaterMax']

            if idx == pos1 or idx == pos2:
                flags.append("kej")
            else:
                flags.append(pseg.lcut(tokens[idx])[0].flag)

            if not self.model.__contains__(tokens[idx]):
                temp = jieba.lcut(tokens[idx])
                tokens[idx] = temp[len(temp) - 1]
                if not self.model.__contains__(tokens[idx]):
                    # print(str(idx) + " " + str(tokens))
                    # print(tokens[idx])
                    tokens[idx] = 'UNKNOWN_WORD'
            tokenidxs[idx] = self.model_vocab[tokens[idx]].index

        arcs = self.parser.parse(words, flags)  # 句法分析
        # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
        # for i in range(len(words)):
        #     print(str(i + 1) + " " + words[i] + " " + flags[i] + " " + str(arcs[i].head) + ":" + arcs[i].relation)
        iter_idx = pos1
        while True:
            if arcs[iter_idx].relation != "HED":
                sdp[iter_idx] = 0.8
                iter_idx = (arcs[iter_idx].head - 1)
            else:
                sdp[iter_idx] = 0.8
                break
        iter_idx = pos2
        while True:
            if arcs[iter_idx].relation != "HED":
                sdp[iter_idx] = 0.8
                iter_idx = (arcs[iter_idx].head - 1)
            else:
                sdp[iter_idx] = 0.8
                break

        # for i in range(len(words)):
        #     print(str(i + 1) + " " + words[i] + " " + flags[i] + " " + str(arcs[i].head) + ":" + arcs[i].relation + " " + str(sdp[i]))
        return tokenidxs, positionValues1, positionValues2, sdp
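
    # Note restating what sentence_w2v() computes: every token starts with an
    # sdp weight of 0.3; then, starting from each of the two entity positions,
    # the code walks up the dependency arcs (arc.head - 1) until it reaches the
    # HED root, setting the weight of every token on that path to 0.8. The
    # result approximates the shortest dependency path between the two entities
    # through the root of the parse.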

    # def sentence_w2v(self, pos1, pos2, sentence):
    #     pos1 = int(pos1)
    #     pos2 = int(pos2)
    #     sdp = np.zeros(self.maxSentenceLen, dtype=np.float32)
    #     tokenidxs = np.zeros(self.maxSentenceLen)
    #     positionValues1 = np.zeros(self.maxSentenceLen)
    #     positionValues2 = np.zeros(self.maxSentenceLen)
    #     tokens = str(sentence).split(" ")
    #     #   words 是 tokens 的副本
    #     words = tokens.copy()
    #     flags = []
    #     slen = len(tokens)
    #     for idx in range(0, slen):
    #         sdp[idx] = 0.3
    #         distance1 = idx - int(pos1)
    #         distance2 = idx - int(pos2)
    #         if distance1 in self.distanceMapping:
    #             positionValues1[idx] = self.distanceMapping[distance1]
    #         elif distance1 <= self.minDistance:
    #             positionValues1[idx] = self.distanceMapping['LowerMin']
    #         else:
    #             positionValues1[idx] = self.distanceMapping['GreaterMax']
    #
    #         if distance2 in self.distanceMapping:
    #             positionValues2[idx] = self.distanceMapping[distance2]
    #         elif distance2 <= self.minDistance:
    #             positionValues2[idx] = self.distanceMapping['LowerMin']
    #         else:
    #             positionValues2[idx] = self.distanceMapping['GreaterMax']
    #
    #         if idx == pos1 or idx == pos2:
    #             flags.append("kej")
    #         else:
    #             flags.append(pseg.lcut(tokens[idx])[0].flag)
    #
    #         if not self.model.__contains__(tokens[idx]):
    #             temp = jieba.lcut(tokens[idx])
    #             tokens[idx] = temp[len(temp) - 1]
    #             if not self.model.__contains__(tokens[idx]):
    #                 # print(str(idx) + " " + str(tokens))
    #                 # print(tokens[idx])
    #                 tokens[idx] = 'UNKNOWN_WORD'
    #         tokenidxs[idx] = self.model_vocab[tokens[idx]].index
    #
    #     arcs = parser.parse(words, flags)  # 句法分析
    #     # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    #     # for i in range(len(words)):
    #     #     print(str(i + 1) + " " + words[i] + " " + flags[i] + " " + str(arcs[i].head) + ":" + arcs[i].relation)
    #     iter_idx = pos1
    #     while True:
    #         if arcs[iter_idx].relation != "HED":
    #             sdp[iter_idx] = 0.8
    #             iter_idx = (arcs[iter_idx].head - 1)
    #         else:
    #             sdp[iter_idx] = 0.8
    #             break
    #     iter_idx = pos2
    #     while True:
    #         if arcs[iter_idx].relation != "HED":
    #             sdp[iter_idx] = 0.8
    #             iter_idx = (arcs[iter_idx].head - 1)
    #         else:
    #             sdp[iter_idx] = 0.8
    #             break
    #
    #     # for i in range(len(words)):
    #     #     print(str(i + 1) + " " + words[i] + " " + flags[i] + " " + str(arcs[i].head) + ":" + arcs[i].relation + " " + str(sdp[i]))
    #     return tokenidxs, positionValues1, positionValues2, sdp

    # def get_max_sentence_len(self,lines):
    #     maxlen = 0
    #     for line in lines:
    #         splits = line.strip().split('\t')
    #         sentence = splits[3]
    #         tokens = sentence.split(" ")
    #         maxlen = max(maxlen, len(tokens))
    #     return maxlen

    def process_one_input(self, input):
        temps = str(input).split("\t")
        relation = temps[0]
        pos1 = temps[1]
        pos2 = temps[2]
        sentence = temps[3].strip()
        tokenidxs, positionValues1, positionValues2, sdp = self.sentence_w2v(
            pos1, pos2, sentence)
        return self.relationsMapping[
            relation], tokenidxs, positionValues1, positionValues2, sdp

    def process_file(self,
                     file,
                     saveFlag=True,
                     savepath='../pkl/sem-relations.pkl.gz'):
        relationidxs = []
        positionMatrix1 = []
        positionMatrix2 = []
        tokenMatrix = []
        sdpMatrix = []
        with codecs.open(file, "r", "utf8") as rd:
            lines = rd.readlines()
            # self.maxSentenceLen = self.get_max_sentence_len(lines)
            for line in lines:
                #检查长度
                if len(line.split("\t")[3].split(" ")) > self.maxSentenceLen:
                    print("超过长度")
                    continue
                relationidx, tokenidxs, positionValues1, positionValues2, sdp = self.process_one_input(
                    line)
                relationidxs.append(relationidx)
                positionMatrix1.append(positionValues1)
                positionMatrix2.append(positionValues2)
                tokenMatrix.append(tokenidxs)
                sdpMatrix.append(sdp)
        relationidxs = np.asarray(relationidxs, dtype='int32')
        positionMatrix1 = np.asarray(positionMatrix1, dtype='int32')
        positionMatrix2 = np.asarray(positionMatrix2, dtype='int32')
        tokenMatrix = np.asarray(tokenMatrix, dtype='int32')
        sdpMatrix = np.asarray(sdpMatrix, dtype='float32')
        if saveFlag:
            self.save_pkl(relationidxs, positionMatrix1, positionMatrix2,
                          tokenMatrix, sdpMatrix, savepath)
        return relationidxs, positionMatrix1, positionMatrix2, tokenMatrix

    def save_pkl(self, relationidxs, positionMatrix1, positionMatrix2,
                 tokenMatrix, sdpMatrix, save_path):
        data = {
            'relationidxs': relationidxs,
            'positionMatrix1': positionMatrix1,
            'positionMatrix2': positionMatrix2,
            'tokenMatrix': tokenMatrix,
            "sdpMatrix": sdpMatrix
        }
        f = gzip.open(save_path, 'wb')
        pkl.dump(data, f)
        f.close()

    def process_one(self, line):
        # self.maxSentenceLen = 78
        if len(line.split("\t")[3].split(" ")) > self.maxSentenceLen:
            print("超过长度")
            return None
        relationidx, tokenidxs, positionValues1, positionValues2, sdp = self.process_one_input(
            line)
        relationidx = np.asarray(relationidx, dtype='int32')
        positionMatrix1 = np.asarray(positionValues1, dtype='int32')
        positionMatrix2 = np.asarray(positionValues2, dtype='int32')
        tokenMatrix = np.asarray(tokenidxs, dtype='int32')
        tokenMatrix = tokenMatrix.reshape((1, self.maxSentenceLen))
        positionMatrix1 = positionMatrix1.reshape((1, self.maxSentenceLen))
        positionMatrix2 = positionMatrix2.reshape((1, self.maxSentenceLen))
        return relationidx, positionMatrix1, positionMatrix2, tokenMatrix


# pre = PreTrain("../w2vmodel/word2vec2.model")
# pre.process_file("../files/train.txt",True,'../pkl/train2.pkl.gz')
# pre.sentence_w2v(2,4,"入宫 为 魏孝文帝 和 文明太后 治过 病 , 多有 疗效")
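# Sketch (added for illustration, not part of the original code): reading back the
# archive written by save_pkl above, assuming the same gzip + pickle layout and
# that the commented process_file call has produced '../pkl/train2.pkl.gz'.
# with gzip.open('../pkl/train2.pkl.gz', 'rb') as f:
#     data = pkl.load(f)
#     relationidxs = data['relationidxs']
#     tokenMatrix = data['tokenMatrix']
#     sdpMatrix = data['sdpMatrix']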
from pyltp import SentenceSplitter
from scipy.spatial.distance import cosine
from bert_serving.client import BertClient

cws_model_path = os.path.join(os.path.abspath('./'), 'ltp_Model', 'cws.model')
pos_model_path = os.path.join(os.path.abspath('./'), 'ltp_Model', 'pos.model')
par_model_path = os.path.join(os.path.abspath('./'), 'ltp_Model',
                              'parser.model')
ner_model_path = os.path.join(os.path.abspath('./'), 'ltp_Model', 'ner.model')

say_words_path = os.path.join(os.path.abspath('./'), 'data',
                              'saying_words.pickle')
segmentor = Segmentor()  # word segmentation
postagger = Postagger()  # POS tagging
recognizer = NamedEntityRecognizer()  # named entity recognition
parser = Parser()  # dependency parsing
segmentor.load(cws_model_path)
postagger.load(pos_model_path)
recognizer.load(ner_model_path)
parser.load(par_model_path)

# load saying words
say_words = pickle.load(open(say_words_path, 'rb'))

# dependency parsing of a sentence


def parsing(sentence):
    words = segmentor.segment(sentence)  # pyltp word segmentation
    postags = postagger.postag(words)  # POS tagging
    arcs = parser.parse(words, postags)  # dependency parsing
Esempio n. 22
0
class LtpParser:
    def __init__(self):
        LTP_DIR = "/Users/rilzob/PycharmProjects/SubjectKG/ltp_data_v3.4.0"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))

    '''Semantic role labeling'''

    def format_labelrole(self, words, postags):
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        # print the results
        for role in roles:
            print(
                role.index, "".join([
                    "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                    for arg in role.arguments
                ]))
        # self.labeller.release()  # release the model
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict

    '''Dependency parsing --- maintain, for every word in the sentence, a dict of its dependency children'''

    def build_parse_child_dict(self, words, postags, arcs):
        child_dict_list = []
        format_parse_list = []
        # print('words:', words)
        # print('postags:', postags)
        # print('arcs:', arcs)
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                # print('arc_index:', arc_index)
                if arcs[arc_index].head == index + 1:  # arc heads are 1-based
                    # print('arc_index.relation:', arcs[arc_index].relation)
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            # print('child_dict:', child_dict)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]  # dependency head id of each word
        relation = [arc.relation for arc in arcs]  # dependency relation of each word
        heads = ['Root' if id == 0 else words[id - 1]
                 for id in rely_id]  # word form of each dependency head
        for i in range(len(words)):
            # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [
                relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                postags[rely_id[i] - 1]
            ]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''Main entry point of the parser'''

    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))
        # segment() returns a native VectorOfString; list() converts it to a Python list
        postags = list(self.postagger.postag(words))
        # postag() returns the POS tags of the segmented words
        arcs = self.parser.parse(words, postags)
        # parse() performs the dependency parsing
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        # # 较原来的版本修改的部分
        # old_child_dict_list, old_format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        # # print('child_dict_list:', child_dict_list)
        # # print('format_parse_list:', format_parse_list)
        # new_format_parse_list = old_format_parse_list
        #
        # # 找到中心词在old_format_parse_list的index
        # hed_num = 0  # 中心词的index
        # for format_parse in old_format_parse_list:
        #     if old_format_parse_list[0] == 'HED':
        #         hed_num = format_parse[2]
        #     else:
        #         continue
        #
        # # 找到被中心词所支配的主语
        # subject = ''  # 中心词的从属词
        # for format_parse in old_format_parse_list:
        #     if format_parse[0] == 'SBV' and format_parse[5] == hed_num:
        #         subject = old_format_parse_list[1]
        #     else:
        #         continue
        #
        # # 对原文进行修改,增加主语
        # for format_parse in old_format_parse_list:
        #     if format_parse[0] == 'ADV':
        #         if old_format_parse_list[format_parse[5]][0] == 'COO':
        #             new_format_parse_list.insert(format_parse[2], list(subject))
        #     else:
        #         continue
        #
        # #
        # for

        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list

    def supply_subject(self, old_format_parse_list):
        # part modified relative to the original version
        # print('child_dict_list:', child_dict_list)
        # print('format_parse_list:', format_parse_list)
        new_format_parse_list = old_format_parse_list

        # find the index of the head word (HED) in old_format_parse_list
        hed_num = 0  # index of the head word
        for old_format_parse in old_format_parse_list:
            if old_format_parse[0] == 'HED':
                hed_num = old_format_parse[2]
            else:
                continue

        # find the subject (SBV) governed by the head word
        subject = ''  # dependent of the head word
        for old_format_parse in old_format_parse_list:
            if old_format_parse[0] == 'SBV' and old_format_parse[5] == hed_num:
                subject = old_format_parse[1]
            else:
                continue

        # modify the original text by inserting the subject
        for old_format_parse in old_format_parse_list:
            if old_format_parse[0] == 'ADV':
                if old_format_parse_list[old_format_parse[5]][0] == 'COO':
                    new_format_parse_list.insert(old_format_parse[2],
                                                 list(('', subject)))
            else:
                continue

        # build the new sentence with the supplemented subject
        string = ''
        for new_format_parse in new_format_parse_list:
            string = string + new_format_parse[1]

        return string
Esempio n. 23
0
class LtpParser:
    def __init__(self):

        # initialize every ltp tool
        LTP_DIR = r"E:\code_Athena_Support"

        # word segmenter
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        # POS tagger
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        # dependency parser
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        # named entity recognizer
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        # semantic role labeller
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))

    '''Semantic role labeling'''

    def format_labelrole(self, words, postags):

        # run dependency parsing on top of the POS tags
        # note:
        # dependency parsing is built on the POS tagging result.
        arcs = self.parser.parse(words, postags)

        # label semantic roles based on the dependency parse
        roles = self.labeller.label(words, postags, arcs)

        # stored as a nested dict: the first-level key is each predicate's
        # index, the second-level dict is keyed by the semantic role type
        # (arg.name), and each value is the list [name, start, end]
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }

        print(roles_dict)
        return roles_dict

    '''Dependency parsing --- maintain, for every word in the sentence, a dict of its dependency children'''

    def build_parse_child_dict(self, words, postags, arcs):

        # data structure: a list with one dict per word; each dict maps a
        # relation name to the children that depend on this word through that
        # relation, so every head word knows its relations and their children
        child_dict_list = []

        # this list lays out the dependency relation of every single word
        format_parse_list = []

        # outer loop: analyse every word
        for index in range(len(words)):

            # prepare the children dict
            child_dict = dict()

            # inner loop: scan every dependency arc
            for arc_index in range(len(arcs)):

                # check whether the current word is the head of this arc;
                # if so, record the dependent as one of its children
                if arcs[arc_index].head == index + 1:  # arc heads are 1-based
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)

        rely_id = [arc.head for arc in arcs]  # dependency head id of each word
        relation = [arc.relation for arc in arcs]  # dependency relation of each word
        heads = ['Root' if id == 0 else words[id - 1]
                 for id in rely_id]  # word form of each dependency head
        for i in range(len(words)):
            # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [
                relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                postags[rely_id[i] - 1]
            ]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''Main entry point of the parser'''

    def parser_main(self, sentence):
        '''Main function of this class'''

        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list
Esempio n. 24
0
class LtpParser:
    def __init__(self):
        LTP_DIR = "D:\ltp_data"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))

    '''Semantic role labeling'''

    def format_labelrole(self, words, postags):
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict

    '''Dependency parsing --- maintain, for every word in the sentence, a dict of its dependency children'''

    def build_parse_child_dict(self, words, postags, arcs):
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:  # arc heads are 1-based
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]  # dependency head id of each word
        relation = [arc.relation for arc in arcs]  # dependency relation of each word
        heads = ['Root' if id == 0 else words[id - 1]
                 for id in rely_id]  # word form of each dependency head
        for i in range(len(words)):
            # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [
                relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                postags[rely_id[i] - 1]
            ]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''Main entry point of the parser'''

    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list
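# A minimal usage sketch (added for illustration, not part of the original
# example): one way parser_main might be called; the sample sentence is
# arbitrary and the LTP models must exist under the LTP_DIR configured above.
if __name__ == '__main__':
    ltp_parser = LtpParser()
    sample_sentence = '李克强总理今天来我家了。'
    words, postags, child_dict_list, roles_dict, format_parse_list = ltp_parser.parser_main(sample_sentence)
    print(words)
    print(postags)
    print(roles_dict)
    print(format_parse_list)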
Esempio n. 25
0
import os
#import pandas as pd
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer

# Set your own model path

MODELDIR = "ltp_data"
print "正在加载LTP模型... ..."

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))

print "加载模型完毕。"

in_file_name = "input.txt"
out_file_name = "outlgh.txt"
begin_line = 1
end_line = 0


#df=pd.DataFrame(columns=["逻辑关系","实体一","关系","实体二"])
def extraction_start(in_file_name, out_file_name, begin_line, end_line):
Esempio n. 26
0
def answersemantic(resultwordlist, resultposlist):  # dependency-parse the answer with LTP, then convert it to

    postagger = Postagger()  # initialize the instance
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
    postagger.load(pos_model_path)  # load the model

    parser = Parser()  # initialize the instance
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
    parser.load(par_model_path)  # load the model

    postags = postagger.postag(resultwordlist)  # POS tagging
    poslist = []
    for i in postags:
        poslist.append(str(i))
    print(poslist)

    arcs = parser.parse(resultwordlist, poslist)

    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

    arcshead = []
    arcsrela = []
    for i in arcs:
        arcshead.append(i.head)
        arcsrela.append(i.relation)
    print(arcshead)
    print(arcsrela)
    semanticlist = []
    length = len(resultwordlist)
    poedictlist = []
    quenum = -1
    for i in range(0, len(resultposlist)):
        if resultposlist[i] == "question":
            quenum = i
    print("resultposlist,resultwordlist:    ", resultwordlist, resultposlist)
    for i in range(0, length):
        if resultposlist[i] in nertypelist:
            num = findproperty(i, arcshead, arcsrela, resultposlist)
            if num != -1:
                # resultposlist[arcshead[i]-1]=="property":  # e.g. 战狼2的上映日期是什么时候 (asking for a property of the mov entity)
                # if arcsrela[i]=="ATT" or arcsrela[i]=="SBV":
                poedict = {}
                poedict["headnode"] = resultwordlist[i]
                poedict["headnodetype"] = resultposlist[i]
                if quenum == -1:
                    questr = ""
                else:
                    questr = questiondict[resultwordlist[quenum]]
                properresult = getrelation(propertydict[resultwordlist[num]],
                                           resultposlist[i], questr)
                endnodetype = getnodetype(propertydict[resultwordlist[num]],
                                          resultposlist[i], questr)
                poedict["relation"] = properresult
                poedict["endnode"] = ""
                poedict["endnodetype"] = endnodetype
                poedict["quesion"] = questr
                poedictlist.append(poedict)
    print(poedictlist)

    postagger.release()  # release the model
    parser.release()  # release the model
    return poedictlist
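# Illustrative call sketch (not in the original code): answersemantic expects a
# pre-segmented word list plus a parallel list of coarse type labels; the type
# values below are hypothetical, and the module-level globals the function
# relies on (LTP_DATA_DIR, nertypelist, questiondict, propertydict,
# findproperty, getrelation, getnodetype) must already be defined as in the
# original project.
# words = ['战狼2', '的', '上映', '日期', '是', '什么', '时候']
# types = ['mov', 'o', 'o', 'property', 'o', 'question', 'o']
# poedictlist = answersemantic(words, types)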
Esempio n. 27
0
class MyLTP():
    def __init__(self):
        ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
        # sys.path = [os.path.join(ROOTDIR, "lib")] + sys.path
        # Set your own model path
        self.MODELDIR = os.path.join(ROOTDIR, "./ltp_data")
        # Init LTP Model
        self.segmentor = Segmentor()
        self.postagger = Postagger()
        self.parser = Parser()
        self.recognizer = NamedEntityRecognizer()
        self.labeller = SementicRoleLabeller()
        self.segmentor.load(os.path.join(self.MODELDIR, "cws.model"))
        self.postagger.load(os.path.join(self.MODELDIR, "pos.model"))
        self.parser.load(os.path.join(self.MODELDIR, "parser.model"))
        self.recognizer.load(os.path.join(self.MODELDIR, "ner.model"))
        self.labeller.load(os.path.join(self.MODELDIR, "pisrl.model"))

    # Every function below returns a list; list[0] is the result for the first sentence
    # ---------------------------- segmentation -------------------------------
    def MySegmentor(self, paragraph):
        # split the paragraph into sentences
        sentences = SentenceSplitter.split(paragraph)
        result = []
        for sentence in sentences:
            words = self.segmentor.segment(sentence)
            # 输出
            # print("\t".join(words))
            result.append(words)
        return result

    # ---------------------------- POS tagging -------------------------------
    def MyPostagger(self, words):
        result = []
        for word in words:
            postags = self.postagger.postag(word)
            # list-of-string parameter is supported in 0.1.5
            # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
            # 输出
            # print("\t".join(postags))
            result.append(postags)
        return result

    # ---------------------------- dependency parsing -------------------------------
    def MyParser(self, words, postags):
        result = []
        for index in range(0, len(words)):
            arcs = self.parser.parse(words[index], postags[index])
            # 输出
            # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
            result.append(arcs)
        return result

    # ---------------------------- named entity recognition -------------------------------
    def MyRecognizer(self, words, postags):
        result = []
        for index in range(0, len(words)):
            netags = self.recognizer.recognize(words[index], postags[index])
            # 输出
            # print("\t".join(netags))
            result.append(netags)
        return result

    # ---------------------------- semantic role labeling -------------------------------
    def MyRoleLabller(self, words, postags, arcs):
        result = []
        for index in range(0, len(words)):
            roles = self.labeller.label(words[index], postags[index],
                                        arcs[index])
            # 输出
            # for role in roles:
            #     print(role.index, "".join(
            #             ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
            result.append(roles)
        return result
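# A minimal pipeline sketch (added for illustration, not part of the original
# example): chaining the MyLTP helpers on an arbitrary sample paragraph; the
# ./ltp_data model files must be present as assumed by __init__.
if __name__ == '__main__':
    ltp = MyLTP()
    paragraph = '元芳你怎么看?我就趴窗口上看呗!'
    words = ltp.MySegmentor(paragraph)
    postags = ltp.MyPostagger(words)
    arcs = ltp.MyParser(words, postags)
    netags = ltp.MyRecognizer(words, postags)
    roles = ltp.MyRoleLabller(words, postags, arcs)
    for sentence_words in words:
        print("\t".join(sentence_words))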
Esempio n. 28
0
class DSFN:
    """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析
    Attributes:
        default_user_dict_dir:str,用户自定义词典目录
        default_model_dir:str,ltp模型文件目录
    """

    entity_verb_new = entity_verb_new()
    all_entity = entity_verb_new.readAllEntity("../../entity_verb//entity_verb_result\\all_entity.json")
    default_model_dir = 'D:\\python-file\\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\'  # LTP model directory

    def __init__(self, model_dir=default_model_dir, all_entity=all_entity):
        self.default_model_dir = model_dir
        # load the LTP models
        self.segmentor = Segmentor()
        user_dict = "..\\source\\user.txt"
        segmentor_flag = self.segmentor.load_with_lexicon(os.path.join(self.default_model_dir, 'cws.model'), user_dict)
        # segmentor_flag = self.segmentor.load(os.path.join(self.default_model_dir, 'cws.model'))
        # POS tagging model
        self.postagger = Postagger()
        postag_flag = self.postagger.load(os.path.join(self.default_model_dir, 'pos.model'))
        # named entity recognition model
        self.recognizer = NamedEntityRecognizer()
        ner_flag = self.recognizer.load(os.path.join(self.default_model_dir, 'ner.model'))
        # dependency parsing model
        self.parser = Parser()
        parser_flag = self.parser.load(os.path.join(self.default_model_dir, 'parser.model'))

        if segmentor_flag or postag_flag or ner_flag or parser_flag:  # this check may be unreliable
            print('load model failed')

    def segment(self, sentence, entity_postag=dict()):
        words = self.segmentor.segment(sentence)
        lemmas = []
        for lemma in words:
            lemmas.append(lemma)
        return lemmas

    def getPostag(self):
        return self.postagger

    def postag(self, lemmas):
        """
        Parameters
        ----------
        lemmas : List,分词后的结果
        entity_dict:Set,实体词典,处理具体的一则判决书的结构化文本时产生
        Returns
        -------
        words:WordUnit List,包括分词与词性标注的结果
        """
        words = []
        # POS tagging
        postags = self.postagger.postag(lemmas)
        for i in range(len(lemmas)):
            # store each segmented and tagged token as a WordUnit; IDs start at 1
            word = WordUnit(i + 1, lemmas[i], postags[i])
            words.append(word)
        # self.postagger.release()  # release
        return words

    def get_postag(self, word):
        """获得单个词的词性标注
        Args:
            word:str,单词
        Returns:
            pos_tag:str,该单词的词性标注
        """
        pos_tag = self.postagger.postag([word])
        return pos_tag[0]

    def netag(self, words):
        """
        命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并
        Parameters
            words : WordUnit list,包括分词与词性标注结果
        Returns
            words_netag:WordUnit list,包含分词,词性标注与命名实体识别的结果
        """
        lemmas = []  # 存储分词后的结果
        postags = []  # 存储词性标注结果
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # 命名实体识别
        netags = self.recognizer.recognize(lemmas, postags)
        words_netag = EntityCombine().combine(words, netags)
        return words_netag

    def parse(self, words):
        """
        Dependency parsing on the segmented, POS-tagged (and optionally NER-merged) result
        Args:
            words_netag: WordUnit list, the result after segmentation, POS tagging and NER
        Returns
            *: SentenceUnit, the sentence unit
        """
        lemmas = []  # segmentation result
        postags = []  # POS tagging result
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # dependency parsing
        arcs = self.parser.parse(lemmas, postags)
        for i in range(len(arcs)):
            words[i].head = arcs[i].head
            words[i].dependency = arcs[i].relation
        return SentenceUnit(words)

    def close(self):
        """
        Shut down and release the models
        """
        # pynlpir.close()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()

    def splitSentence(self,text):
        pattern = r'。|!|?|;|='
        result_list = re.split(pattern, text)
        result_list = list(filter(self.not_empty, result_list))
        #    print(result_list)
        return result_list

    def splitSentenceByComma(self,text):
        pattern = r','
        result_list = re.split(pattern, text)
        result_list = list(filter(self.not_empty, result_list))
        final_list = []
        for sentence in result_list:
            if len(sentence) <= 40:
                final_list.append(sentence)
        return final_list

    def not_empty(self,s):
        return s and "".join(s.split())

    def dsfn1_2_3_4COO(self, sentence, item1, item2):
        allTripes = []

        """
        Check whether the two entities match the DSFN1 pattern and, if so, emit the triple
        """
        location_position_list = ['主席','总统','总理','主任','内','东门','西门','南门','北门','大门','外','国家主席','尚书'
                                  ]
        if self.dsfnConstraints3(sentence,item1,item2) and (item1.dependency == "ATT" and item1.head_word.postag != 'v' and item1.head_word.postag != 'a'):
            AttWord = item1.head_word
            AttWordDict = dict()
            AttWordStr = ""
            while AttWord.ID < item2.ID:
                AttWordDict[AttWord.ID] = AttWord.lemma
                # AttWordStr += AttWord.lemma
                if (AttWord.dependency == "ATT" and AttWord.head_word.postag != 'v' and AttWord.head_word.postag != 'a' ):
                    AttWord = AttWord.head_word
                else:
                    break

            if (AttWord.ID == item2.ID):
                flag = True
                while flag:
                    len1 = len(AttWordDict)
                    AttList = AttWordDict.keys()
                    for id in range(item1.ID + 1, item2.ID):
                        item = sentence.get_word_by_id(id)
                        if item.head_word != None and item.head_word.ID in AttList and (item.dependency == "ATT" and  item.postag != 'v' and item.postag != 'a'):
                            AttWordDict[item.ID] = item.lemma
                    if len1 == len(AttWordDict):
                        flag = False
                    else:
                        flag = True
                AttWordDict = sorted(AttWordDict.items(), key=lambda item: item[0])
                AttWordStr = ""
                for i in AttWordDict:
                    AttWordStr += i[1]
                # print("三元组:(" + item1.lemma + "," + AttWordStr + "," + item2.lemma + ")")
                if AttWordStr in location_position_list:
                    allTripes.append([item1.lemma, AttWordStr, item2.lemma])


        """
        Handle the DSFN2 pattern
        """
        if item1.dependency == "SBV" and item1.head_word.postag == "v":
            pred1 = item1.head_word
            predDict = dict()
            predDict[pred1.ID] = pred1.lemma

            if item2.dependency == "VOB" and item2.head_word.postag == "v":
                pred2 = item2.head_word
                predDict[pred2.ID] = pred2.lemma
                if (len(predDict) == 1):
                    PredWordStr = ""
                    for i in predDict:
                        PredWordStr += predDict[i]
                    # print("DSFN2三元组:(" + item1.lemma + "," + PredWordStr + "," + item2.lemma + ")")
                    allTripes.append([item1.lemma, PredWordStr, item2.lemma])
                    """
                    Newly added to handle coordinated verbs sharing one subject, e.g. "习近平视察和访问上海"
                    """
                if len(predDict) ==2:
                    num = self.get_entity_num_between(pred1,pred2,sentence)
                    flagSBV = True
                    flagVOB = True
                    for word in sentence.words:
                        if word.dependency == "SBV" and word.head_word.ID == pred2.ID:
                            flagSBV = False
                        if (word.dependency == "VOB" and word.head_word.ID == pred1.ID)  or (word.dependency == "POB" \
                                and word.head_word.dependency == "ADV" and word.head_word.head_word.ID == pred1.ID):
                            flagVOB = False
                    flagCMP= True
                    if pred1!=None and pred1.dependency == "CMP" and pred1.head_word.ID == pred2.ID:
                        flagCMP = False
                    if pred2!=None and pred2.dependency == "CMP" and pred2.head_word.ID == pred1.ID:
                        flagCMP = False

                    # print("pred1:"+pred1.lemma+",pred2:"+pred2.lemma+",num:"+str(num))
                    if num == 0 :
                        if flagCMP == False :
                            if flagVOB == True and flagSBV == True:
                                allTripes.append([item1.lemma, pred1.lemma + "" +pred2.lemma, item2.lemma])
                        else:
                            if flagVOB == True:
                                allTripes.append([item1.lemma, pred1.lemma, item2.lemma])
                            if flagSBV == True:
                                allTripes.append([item1.lemma, pred2.lemma, item2.lemma])



        """
        DSFN3.0
        """
        pred = None
        prep = None
        if item1.dependency == "SBV" and item1.head_word.postag == "v" and item2.dependency == "POB":
            pred = item1.head_word
            prep = item2.head_word
        elif item1.dependency == "FOB" and item2.dependency == "POB":  # handle the passive marker "被", e.g. "小王被小明所陷害"
            pred = item1.head_word
            prep = item2.head_word
            c = item1
            item1 = item2
            item2 = c
        if pred != None and prep != None:
            if prep.dependency == "ADV":
                if prep.head_word.ID == pred.ID:
                    pred2 = None
                    object = None
                    objectForPred2 = None
                    for i in range(pred.ID + 1, len(sentence.words) + 1):
                        item = sentence.get_word_by_id(i)

                        if item.dependency == "VOB" and item.head_word.ID == pred.ID:
                            object = item
                            objectDict = dict()
                            objectDict[object.ID] = object
                            for word in sentence.words:
                                if word.head_word != None and word.dependency == "ATT" and word.head_word.ID in objectDict:
                                    objectDict[word.ID] = word
                            objectDict = sorted(objectDict.items(), key=lambda item: item[0])
                            objectStr = ""
                            for objectItem in objectDict:
                                objectStr += objectItem[1].lemma
                            # print(
                            #     "DSFN3三元组:(" + item1.lemma + "," + pred.lemma + "" + objectStr + "," + item2.lemma + ")")
                            allTripes.append([item1.lemma, pred.lemma + "" + objectStr, item2.lemma])
                            # print("DSFN3三元组:(" + item1.lemma + "," + pred.lemma + "" + object.lemma + "," + item2.lemma + ")")
                            # allTripes.append([item1.lemma, pred.lemma + "" + object.lemma, item2.lemma])
                    if object == None:
                        # print("DSFN3三元组:(" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")")
                        allTripes.append([item1.lemma, pred.lemma , item2.lemma])
        """
        DSFN4
        """
        pred = None
        prep = None
        prep1 = None
        pred2 = None
        if item1.dependency == "SBV" and item2.dependency == "POB":
            pred = item1.head_word
            prep = item2.head_word
            if prep.dependency == "CMP":
                pred2 = prep.head_word
                if pred2.ID == pred.ID:
                    # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep.lemma + "," + item2.lemma + ")")
                    allTripes.append([item1.lemma, pred.lemma + "" + prep.lemma, item2.lemma])
                else :
                    num = self.get_entity_num_between(pred, pred2, sentence)
                    flagSBV = True
                    flagVOB = True
                    for word in sentence.words:
                        if word.dependency == "SBV" and word.head_word.ID == pred2.ID:
                            flagSBV = False
                        if (word.dependency == "VOB" and word.head_word.ID == pred.ID) or (word.dependency == "POB" \
                                and word.head_word.dependency == "ADV" and word.head_word.head_word.ID == pred.ID):
                            flagVOB = False
                    # print("pred1:"+pred1.lemma+",pred2:"+pred2.lemma+",num:"+str(num))
                    if num == 0 :
                        flag = True
                        for word in sentence.words:
                            if word.dependency == "CMP" and word.head_word.ID == pred.ID:
                                prep1 = word
                        if prep1 != None:
                            if flagVOB == True:
                                # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep1.lemma + "," + item2.lemma + ")")
                                allTripes.append([item1.lemma, pred.lemma + "" + prep1.lemma, item2.lemma])
                            # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")")
                            if flagSBV == True:
                                allTripes.append([item1.lemma, pred2.lemma + "" + prep.lemma, item2.lemma])
                        else:
                            if flagVOB == True:
                                # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")")
                                allTripes.append([item1.lemma, pred.lemma, item2.lemma])
                            if flagSBV == True:
                            # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")")
                                allTripes.append([item1.lemma, pred2.lemma + "" + prep.lemma, item2.lemma])

        """
        DSFN5
        """
        # self.dsfn5and6(rawSentence,sentence,item1,item2)
        return allTripes

    def get_entity_num_between(self, verb1, verb2, sentence):
        """
        Count the entities between two verbs
        Parameters
        ----------
        verb1 : WordUnit, the first verb
        verb2 : WordUnit, the second verb
        Returns:
            num : int, the number of entities between the two verbs
        """
        if verb1.ID > verb2.ID:
            c = verb1
            verb1 = verb2
            verb2 = c
        num = 0
        i = verb1.ID
        while i < verb2.ID-1:
            if self.is_entity(sentence.words[i]):
                num +=1
            i +=1
        return num

    def is_entity(self, entry):
        """Check whether a word unit is an entity
        Args:
            entry: WordUnit, the word unit
        Returns:
            *: bool, True if it is an entity, False otherwise
        """
        # POS tags that may mark a candidate entity
        entity_postags = ['nh','ni','ns','nz','j','n','v']
        # print(entry.lemma+" : "+entry.postag)
        if entry.postag in entity_postags:
            return True
        else:
            return False

    def dsfnAttCOO(self, sentence, item1, item2):
        item1Att = item1
        item2Att = item2
        while item1Att.dependency == "ATT":
            item1Att = item1Att.head_word

        allTripe = self.dsfn1_2_3_4COO(sentence,item1Att,item2)
        if allTripe == None or len(allTripe) == 0:
            while item2Att.dependency == "ATT":
                item2Att = item2Att.head_word
            allTripe = self.dsfn1_2_3_4COO(sentence,item1,item2Att)
        if allTripe == None or len(allTripe) == 0:
            allTripe = self.dsfn1_2_3_4COO(sentence,item1Att,item2Att)
        for tripe in allTripe:
            if tripe[0] == item1Att.lemma:
                tripe[0] = item1.lemma
            if tripe[2] == item2Att.lemma:
                tripe[2] = item2.lemma
        return allTripe

    def dsfn5COO(self, sentence, item1, item2):
        if item1.dependency == "COO":
            item1COO = item1.head_word
            allTripes1 = self.dsfn1_2_3_4COO(sentence,item1COO,item2)
            # print(allTripes1)
            for tripe in allTripes1:
                if tripe[0] == item1COO.lemma:
                    tripe[0] = item1.lemma
                elif tripe[2] == item1COO.lemma:
                    tripe[2] = item1.lemma
            return allTripes1
            # print("allTripes1"+str(allTripes1))
    def dsfn6COO(self,sentence,item1,item2):
        if item2.dependency == "COO":
            item2COO = item2.head_word
            allTripes2 = self.dsfn1_2_3_4COO(sentence,item1,item2COO)
            for tripe in allTripes2:
                if tripe[2] == item2COO.lemma:
                    tripe[2] = item2.lemma
                elif tripe[0] == item2COO.lemma:
                    tripe[0] = item2.lemma
            return allTripes2
    def dsfn5and6COO(self,sentence,item1,item2):
        if item1.dependency == "COO":
            item1COO = item1.head_word
            if item2.dependency == "COO":
                item2COO = item2.head_word
                allTripe = self.dsfn1_2_3_4COO(sentence,item1COO,item2COO)
                for tripe in allTripe:
                    if tripe[0] == item1COO.lemma and tripe[2] == item2COO.lemma:
                        tripe[0] = item1.lemma
                        tripe[2] = item2.lemma
                    if tripe[2] == item1COO.lemma and tripe[0] == item2COO.lemma:
                        tripe[2] = item1.lemma
                        tripe[0] = item2.lemma
                return allTripe
    def dsfnStart(self, rawSentence, entity1, entity2,all_entity):
        nounRelatedWithPosition = ['主席','总理','教授','校长']
        resultList = []
        lemmas = self.segment(rawSentence)
        words = self.postag(lemmas)
        words_netag = self.netag(words)
        sentence = self.parse(words_netag)
        # print(sentence.to_string())
        Rawitem1 = None
        Rawitem2 = None
        item1 = None
        item2 = None
        Rawitem1Index = -1
        Rawitem2Index = -1
        indexList = [-1,-1]
        for item in sentence.words:
            if (item.lemma == entity1):
                Rawitem1 = item
            if (item.lemma == entity2):
                Rawitem2 = item
            if Rawitem1 != None and Rawitem2 != None and (Rawitem1.ID!=Rawitem1Index or Rawitem2.ID!=Rawitem2Index):
                Rawitem1Index = Rawitem1.ID
                Rawitem2Index = Rawitem2.ID
                # if item1 == None or item2 == None:
                #     return None
                item1 = Rawitem1
                item2 = Rawitem2
                if item1.ID > item2.ID:
                    c = item1
                    item1 = item2
                    item2 = c
                # print(str(item1.ID) + "   " + str(item2.ID))
                itemCopy1 = item1
                itemCopy2 = item2
                # print(item1.lemma)
                # print(item2.lemma)
                # print(self.dsfnConstraints2(sentence,item1,item2,all_entity))
                if self.dsfnConstraints2(sentence,item1,item2,all_entity) == False:
                    continue
                allTripes = self.dsfnStartCOO2(sentence,item1,item2)
                # print("111"+item2.lemma)
                if allTripes!=None and len(allTripes) == 0:
                    while item1.dependency == "ATT":
                        item1 = item1.head_word
                    while item2.dependency == "ATT":
                        item2 = item2.head_word
                    allTripes = self.dsfnStartCOO2(sentence, item1, item2)
                    if len(allTripes) != 0:
                        for tripe in allTripes:
                            if tripe[1]!= "":
                                if tripe[0] == item1.lemma:
                                    if item1.ID < itemCopy1.ID:
                                        tripe[0] = item1.lemma+""+itemCopy1.lemma
                                    elif item1.ID > itemCopy1.ID:
                                        tripe[0] = itemCopy1.lemma+""+item1.lemma
                                    else:
                                        tripe[0] = itemCopy1.lemma

                                elif tripe[2] == item1.lemma:
                                    if item1.ID < itemCopy1.ID:
                                        tripe[2] = item1.lemma+""+itemCopy1.lemma
                                    elif item1.ID > itemCopy1.ID:
                                        tripe[2] = itemCopy1.lemma+""+item1.lemma
                                    else:
                                        tripe[2] = itemCopy1.lemma
                                    # tripe[2] = itemCopy1.lemma

                                if tripe[0] == item2.lemma:
                                    if item2.ID < itemCopy2.ID:
                                        tripe[0] = item2.lemma + ""+ itemCopy2.lemma
                                    elif item2.ID > itemCopy2.ID:
                                        tripe[0] = itemCopy2.lemma + ""+ item2.lemma
                                    else:
                                        tripe[0] = itemCopy2.lemma
                                elif tripe[2] == item2.lemma:
                                    # print(item2.lemma)
                                    if item2.ID < itemCopy2.ID:
                                        tripe[2] = item2.lemma + ""+ itemCopy2.lemma
                                    elif item2.ID > itemCopy2.ID:
                                        tripe[2] = itemCopy2.lemma + ""+ item2.lemma
                                    else:
                                        tripe[2] = itemCopy2.lemma
                                # print("12345")
                                resultList.append(tripe)
                else:
                    for tripe in allTripes:
                        if tripe[1]!="":
                            resultList.append(tripe)
                    # if len(resultList) > 0:
                    #     return np.array(set([tuple(t) for t in resultList]))
        if item1 == None or item2 == None:
            return None
        if len(resultList) > 0:
            # return np.array(set([tuple(t) for t in resultList]))
            # print("输出结果1"+str(resultList))
            return resultList
    def dsfnStartCOO2(self, sentence, item1, item2):
        nounRelatedWithPosition = ['主席', '总理', '教授', '校长']
        resultList = []
        itemCopy1 = item1
        itemCopy2 = item2
        """
        Handle nouns reached through an ATT dependency, e.g. 李克强[ATT] <----- 总理[SBV]
        """
        # print(item1.lemma)
        # print(item2.lemma)
        allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2)
        if len(allTripes) == 0:
            # print("11111111")
            allTripes = self.dsfn5COO(sentence, item1, item2)
            if allTripes == None or len(allTripes) == 0:
                # print("2222222")
                allTripes = self.dsfn6COO(sentence, item1, item2)
                if allTripes == None or len(allTripes) == 0:
                    # print("3333333")
                    allTripes = self.dsfn5and6COO(sentence, item1, item2)
                    # if allTripes == None or len(allTripes) == 0:
                    #     print("44444444444")
                    #     allTripes = self.dsfnAttCOO(sentence,item1,item2)
        # print("第一次"+str(allTripes))
        if allTripes != None and len(allTripes) != 0:
            for tripe in allTripes:
                resultList.append(tripe)
        # print("第二次")
        pred1 = None
        subForCoo = None
        for item in sentence.words:
            if item.postag == "v" and item.dependency == "COO":
                pred1 = item.head_word

                for word in sentence.words:
                    if word.dependency == "SBV" and word.head_word.ID == pred1.ID:
                        for phrase in sentence.words:
                            if phrase.dependency == "SBV" and phrase.head_word.ID == item.ID:
                                subForCoo = phrase
                        if subForCoo == None or (
                                subForCoo != None and subForCoo.ID == word.ID):  # handle coordinated (COO) verbs; the coordinated verb must not have its own subject.
                            # e.g. 习近平主席视察厦门,李克强总理访问香港
                            word.head_word = item
                            allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2)
                            if len(allTripes) == 0:
                                # print("11111111")
                                allTripes = self.dsfn5COO(sentence, item1, item2)
                                if allTripes == None or len(allTripes) == 0:
                                    # print("2222222")
                                    allTripes = self.dsfn6COO(sentence, item1, item2)
                                    if allTripes == None or len(allTripes) == 0:
                                        # print("3333333")
                                        allTripes = self.dsfn5and6COO(sentence, item1, item2)
                                        # if allTripes == None or len(allTripes) == 0:
                                        #     allTripes = self.dsfnAttCOO(sentence,item1,item2)
                            # print("第二次"+str(allTripes))
                            if allTripes != None and len(allTripes) != 0:
                                for tripe in allTripes:
                                    resultList.append(tripe)
        # print(np.array(set([tuple(t) for t in resultList])))
        return resultList

    def dsfnConstraints1(self, rawSentence, maxLength):
        """
        :param rawSentence: the original sentence
        :param maxLength: the maximum sentence length
        :return: sub-sentences whose length does not exceed maxLength
        """
        newSentence = []
        if len(rawSentence) <= maxLength:
            newSentence.append(rawSentence)
            return newSentence
        else:
            newSentence = self.splitSentenceByComma(rawSentence)
            return newSentence

    def dsfnConstraints2(self,sentence,item1,item2,allEntities):
        countEntity = 0
        countChar = 0
        for index in range(item1.ID+1, item2.ID):
            word = sentence.get_word_by_id(index)
            countChar += len(word.lemma)
            if word.lemma in allEntities:
                countEntity +=1
        # print(countEntity)
        # print(countChar)
        if countEntity > 3:
            return False
        elif countChar > 12:
            return False
        else:
            return True

    def dsfnConstraints3(self,sentence,item1,item2):
        countChar = 0
        for index in range(item1.ID+1, item2.ID):
            word = sentence.get_word_by_id(index)
            countChar += len(word.lemma)
        if countChar > 5:
            return False
        else:
            return True

    def getSPO(self,sentence):
        all_result = []
        raw_sentence = []
        RawSentence = sentence
        lemmas = self.segment(sentence)
        words = self.postag(lemmas)
        words_netag = self.netag(words)
        sentence = self.parse(words_netag)
        print(sentence.to_string())
        for itemWord in sentence.words:
            # find a verb that is either the HED of the sentence or stands in a COO relation with the HED
            if (itemWord.head_word == None and itemWord.postag == "v" ) or (itemWord.postag == "v" and
                                                                  itemWord.dependency == "COO" and itemWord.head_word.head_word == None)\
                     or (itemWord.postag == "v"):
                relation_verb = itemWord   # use the verb we just found as relation_verb
                relationString = relation_verb.lemma
                if itemWord.head_word==None:
                    verbId = itemWord.ID   # ID of the relation verb
                    verbId2 = None
                elif itemWord.head_word.head_word == None:
                    verbId = itemWord.ID   # ID of the relation verb
                    verbId2 = itemWord.head_word.ID   # the HED of the sentence, used to look up the subject
                else:
                    verbId = itemWord.ID   # ID of the relation verb
                    verbId2 = None
                O_dict = dict() # all the objects
                S_dict = dict() # all the subjects
                verb_dict = dict() # all the verbs, mainly for cases like: 习近平主席在北京大学发表演讲
                OBJ = None
                SUB = None
                for item in sentence.words:
                    if item.dependency == "SBV" and item.head_word.ID == verbId: #寻找这个动词的主语
                        # if SUB == None or SUB.lemma != entity:
                        SUB = item #找到主语
                        S_dict[SUB.ID] = SUB.lemma #将主语加入到字典中

                    if (item.dependency == "VOB" and item.head_word.ID == verbId):
                        # 找到这个动词的宾语,其中包括:直接宾语,介词宾语(该宾语依赖POB---->介词(词性为p)--ADV or CMP-->动词)
                        OBJ = item
                        O_dict[OBJ.ID] = OBJ.lemma
                        relationString = relation_verb.lemma
                        verb_dict[OBJ.ID] = relationString
                    if (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP"
                                and item.head_word.head_word.ID== verbId) :
                        # 找到这个动词的宾语,其中包括:直接宾语,介词宾语(该宾语依赖POB---->介词(词性为p)--ADV or CMP-->动词)
                        OBJ = item
                        O_dict[OBJ.ID] = OBJ.lemma
                        relationString = relation_verb.lemma + "" + item.head_word.lemma
                        verb_dict[OBJ.ID] = relationString

                    if (item.dependency == "POB" and item.head_word.postag == "p"\
                        and item.head_word.dependency == "ADV" and item.head_word.head_word.ID== verbId):
                        # 找到这个动词的宾语,其中包括:直接宾语,介词宾语(该宾语依赖POB---->介词(词性为p)--ADV or CMP-->动词)
                        OBJ = item
                        O_dict[OBJ.ID] = OBJ.lemma
                        relationString  = relation_verb.lemma
                        for eachWord in sentence.words:
                            if eachWord.dependency == "VOB" and eachWord.head_word.ID == relation_verb.ID:
                                relationString = relation_verb.lemma + "" + eachWord.lemma
                        verb_dict[OBJ.ID] = relationString

                if SUB == None:#如果没找到主语,那么就找与该动词并列的verbId2的主语
                    for item in sentence.words:
                        if item.dependency == "SBV" and item.head_word.ID == verbId2:
                            # if SUB == None or SUB.lemma != entity:
                            SUB = item
                            S_dict[SUB.ID] = SUB.lemma

                if OBJ == None:
                    verb_coo = None
                    for item in sentence.words:
                        if item.dependency == "COO" and item.head_word.ID == verbId and item.ID > verbId:
                            verb_coo = item
                            break
                    flag = True
                    if verb_coo != None and self.get_entity_num_between(relation_verb,verb_coo,sentence) == 0:

                        for item in sentence.words:
                            if item.dependency == "SBV" and item.head_word.ID == verb_coo.ID:
                                flag = False
                        if flag!= False:
                            for item in sentence.words:
                                if (item.dependency == "VOB" and item.head_word.ID == verb_coo.ID)\
                                        or (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP"
                                and item.head_word.head_word.ID== verb_coo.ID) or (item.dependency == "POB" and item.head_word.postag == "p"\
                        and item.head_word.dependency == "ADV" and item.head_word.head_word.ID== verb_coo.ID):

                                    OBJ = item
                                    O_dict[OBJ.ID] = OBJ.lemma
                print(verb_dict)
                print(O_dict)
                SUB_COO = None
                OBJ_COO = None
                for item in sentence.words:
                    if item.head_word != None:
                        if SUB != None and item.dependency == "COO" and item.head_word.ID  in S_dict: #获得主语的COO
                            SUB_COO = item
                            S_dict[SUB_COO.ID] = SUB_COO.lemma
                    if item.head_word != None and OBJ!=None:
                        if item.dependency == "COO" and item.head_word.ID in O_dict: #获得宾语的COO
                            OBJ_COO = item
                            O_dict[OBJ_COO.ID] = OBJ_COO.lemma
                S_new = []

                for sub in S_dict:
                    if sentence.get_word_by_id(sub).postag == 'r':
                        continue
                    S_dict2 = dict()  # 存放主语ATT的列表
                    S_dict2[sub] = S_dict[sub]
                    flag = True
                    while flag == True:
                        len1 = len(S_dict2)
                        for item in sentence.words:
                            if item.head_word != None:
                                SUBList = S_dict2.keys()
                                if item.head_word.ID in SUBList and (item.dependency == "ATT" or item.dependency == "ADV"):
                                    SUBATT = item
                                    S_dict2[SUBATT.ID] = SUBATT.lemma

                            if len(S_dict2) != len1 :
                                flag = True
                            else:
                                flag = False
                    S_dict2 = sorted(S_dict2.items(), key=lambda item: item[0])
                    Subject = ""
                    for i in S_dict2:
                        Subject += i[1]
                    S_new.append(Subject)

                O_new = []
                V_new = []
                for obj in O_dict:
                    if sentence.get_word_by_id(obj).postag == 'r':
                        continue
                    O_dict2 = dict()  # 存放宾语ATT的列表
                    O_dict2[obj] = O_dict[obj]
                    if verb_dict!=None:
                        if obj in verb_dict:
                            relationString2  = verb_dict[obj]
                        else:
                            relationString2 = relation_verb.lemma
                    else:
                        relationString2 = relation_verb.lemma
                    V_new.append(relationString2)
                    flag = True
                    while flag == True:
                        len2 = len(O_dict2)
                        for item in sentence.words:
                            if item.head_word != None:
                                OBJList = O_dict2.keys()
                                if item.head_word.ID in OBJList and (item.dependency == "ADV" or item.dependency == "ATT" or item.dependency == "VOB"):
                                    OBJATT = item
                                    O_dict2[OBJATT.ID] = OBJATT.lemma

                            if len(O_dict2) != len2:
                                flag = True
                            else:
                                flag = False #一直循环,直到找不到新的修饰词
                    O_dict2 = sorted(O_dict2.items(), key=lambda item: item[0])
                    Object = ""
                    for i in O_dict2:
                        Object += i[1]
                    O_new.append(Object)
                print(O_dict)
                print(O_new)
                for sub in S_new:
                    for i in range(0,len(O_new)):
                        obj = O_new[i]
                        relationWord = V_new[i]
                        if obj != "":
                            # print(RawSentence)
                            # print((sub, relationWord, obj))
                            all_result.append([sub,relationWord,obj])
                            raw_sentence.append(RawSentence)

        return all_result,raw_sentence

    def hasEntity(self,word,allEntity):
        for entity in allEntity:
            if entity in word:
                # print(entity)
                return True
        return False

    def PostProcessSPO(self,rawSentence,allTripes,allEntity):
        output_list = []
        for i in range(0,len(allTripes)):
            tripe = allTripes[i]
            sub = tripe[0]
            obj = tripe[2]
            # print(sub)
            # print(obj)
            if self.hasEntity(sub,allEntity) and self.hasEntity(obj,allEntity):
                output_list.append(tripe)
        return output_list
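# Usage sketch (illustrative only, not part of the original example): running
# the DSFN pipeline end-to-end; the helper classes it relies on (WordUnit,
# SentenceUnit, EntityCombine, entity_verb_new) and the LTP models must be
# available exactly as in the original project.
# dsfn = DSFN()
# all_tripes, raw_sentences = dsfn.getSPO('习近平主席在北京大学发表演讲。')
# print(all_tripes)
# dsfn.close()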
Esempio n. 29
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import Parser
import name_convert as nc
import title_to_tree as tot
import predict as pt

segmentor = Segmentor()
segmentor.load_with_lexicon("./system/ltp_data/cws.model","./system/ltp_data/plain.txt")

postagger = Postagger()
postagger.load_with_lexicon("./system/ltp_data/pos.model","./system/ltp_data/postagger.txt")

parser = Parser()
parser.load("./system/ltp_data/parser.model")

def get_result(company_name,news_titles):
	title_tree = []
	for sentence in news_titles:
		words = segmentor.segment(sentence)
		words = nc.convert(words)
		if company_name not in words:
			add_word = [company_name, u':'.encode('utf8')]
			add_word.extend(words)
			words = add_word
		# print ("\t".join(words))
		postags = postagger.postag(words)
		# print ("\t".join(postags))
		arcs = parser.parse(words,postags)
Esempio n. 30
0
class Extractor():
    def __init__(self):
        self.__clause_list = []
        self.__subclause_dict = {}
        self.__triple_list = []
        self.__segmentor = Segmentor()
        self.__postagger = Postagger()
        self.__recognizer = NamedEntityRecognizer()
        self.__parser = Parser()
        self.__labeller = SementicRoleLabeller()
        self.__words_full_list = []
        self.__netags_full_list = []

    @property
    def clause_list(self):
        return self.__clause_list

    @property
    def triple_list(self):
        return self.__triple_list

    def split(self, words, postags):
        start = 0
        for j, w in enumerate(words):
            if w == ',' or w == ',' or w == '。':
                clause = Clause(start, j - 1)
                self.__clause_list.append(clause)
                start = j + 1

        for clause in self.__clause_list:
            clause.split(postags)
            for subclause in clause.sub_clause_list:
                self.add_inverted_idx(subclause)

    def add_inverted_idx(self, subclause):
        for i in range(subclause.start_idx, subclause.end_idx):
            self.__subclause_dict[i] = subclause

    def load(self):
        self.__segmentor.load('ltp_data/cws.model')
        self.__postagger.load('ltp_data/pos.model')
        self.__recognizer.load('ltp_data/ner.model')
        self.__parser.load('ltp_data/parser.model')
        self.__labeller.load('ltp_data/srl')

    def release(self):
        self.__segmentor.release()
        self.__postagger.release()
        self.__recognizer.release()
        self.__parser.release()
        self.__labeller.release()

    def clear(self):
        self.__triple_list = []
        self.__words_full_list = []
        self.__netags_full_list = []

    def resolve_conference(self, entity):
        try:
            e_str = entity.get_content_as_str()
        except Exception:
            return '?'
        ref = e_str
        if e_str == '他' or e_str == '她':
            for i in range(entity.loc, -1, -1):
                if self.__netags_full_list[i].lower().endswith('nh'):
                    ref = self.__words_full_list[i]
                    break
        return ref

    def resolve_all_conference(self):
        for t in self.triple_list:
            e_str = self.resolve_conference(t.entity_1)
            try:
                t.entity_1.content = e_str.split()
            except Exception:
                pass

    def sentenceSplite(self, text):
        return SentenceSplitter.split(text)

    def chunk_str(self, data):
        sents = self.sentenceSplite(data)
        offset = 0
        for sent in sents:
            try:
                words = self.__segmentor.segment(sent)
                postags = self.__postagger.postag(words)
                netags = self.__recognizer.recognize(words, postags)
                arcs = self.__parser.parse(words, postags)
                roles = self.__labeller.label(words, postags, netags, arcs)
                self.chunk_sent(list(words), list(postags), list(arcs), offset)
                offset += len(list(words))
                self.__words_full_list.extend(list(words))
                self.__netags_full_list.extend(list(netags))
            except Exception as e:
                print(str(e))
                pass

    def chunk_sent(self, words, postags, arcs, offset):
        root = [i for i, x in enumerate(arcs) if x.relation == 'HED']
        if len(root) > 1:
            raise Exception('More than 1 HEAD arc is detected!')
        root = root[0]
        relations = [
            i for i, x in enumerate(arcs)
            if x.head == root and x.relation == 'COO'
        ]
        relations.insert(0, root)

        prev_e1 = None
        e1 = None
        for rel in relations:

            left_arc = [
                i for i, x in enumerate(arcs)
                if x.head == rel and x.relation == 'SBV'
            ]

            if len(left_arc) > 1:
                pass
                #raise Exception('More than 1 left arc is detected!')
            elif len(left_arc) == 0:
                e1 = prev_e1
            elif len(left_arc) == 1:
                left_arc = left_arc[0]
                leftmost = find_farthest_att(arcs, left_arc)
                e1 = Entity(1,
                            [words[i] for i in range(leftmost, left_arc + 1)],
                            offset + leftmost)

            prev_e1 = e1

            right_arc = [
                i for i, x in enumerate(arcs)
                if x.head == rel and x.relation == 'VOB'
            ]

            e2_list = []
            if not right_arc:
                e2 = Entity(2, None)
                e2_list.append(e2)
            else:
                right_ext = find_farthest_vob(arcs, right_arc[0])

                items = [
                    i for i, x in enumerate(arcs)
                    if x.head == right_ext and x.relation == 'COO'
                ]
                items = right_arc + items

                count = 0
                for item in items:
                    leftmost = find_farthest_att(arcs, item)

                    e2 = None

                    if count == 0:
                        e2 = Entity(
                            2,
                            [words[i] for i in range(leftmost, right_ext + 1)],
                            offset + leftmost)
                    else:
                        p1 = range(leftmost, right_arc[0])
                        p2 = range(item, find_farthest_vob(arcs, item) + 1)
                        e2 = Entity(
                            2, [words[i] for i in itertools.chain(p1, p2)])

                    e2_list.append(e2)
                    r = Relation(words[rel])
                    t = Triple(e1, e2, r)
                    self.__triple_list.append(t)
                    count += 1

    def segment(self, sentence):
        words = self.__segmentor.segment(sentence)
        return words

    def postag(self, words):
        postags = self.__postagger.postag(words)
        return postags

    def parse(self, words, postags):
        arcs = self.__parser.parse(words, postags)
        return arcs

    def recognize(self, words, postags):
        netags = self.__recognizer.recognize(words, postags)
        return netags

    def label(self, words, postags, netags, arcs):
        roles = self.__labeller.label(words, postags, netags, arcs)
        return roles
        format = 'json'
        pattern = 'all'
        result = urllib2.urlopen("%sapi_key=%s&text=%s&format=%s&pattern=%s" % (url_get_base,api_key,text,format,pattern))
        content = result.read().strip()
    # print content
        return json.loads(content)[0]
    else:
        aa= []
        return aa


segmentor = Segmentor()
segmentor.load_with_lexicon(os.path.join(MODELDIR,"cws.model"),"/data0/dm/dict/dict.txt")
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))



# analyse a single sentence
def callLTP(sentence):
    words = segmentor.segment(sentence)
    postags = postagger.postag(words)
    arcs = parser.parse(words, postags)
    resultJson=[]
    for index in range(len(words)):
        resultJson.append({'id':index,'cont':words[index],'pos':postags[index],'relate':arcs[index].relation,'parent':arcs[index].head - 1})
    return resultJson
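
# A quick usage sketch, assuming the models above loaded successfully; the
# sample sentence is arbitrary. callLTP returns one dict per token with its
# id, surface form ('cont'), POS tag, dependency relation and 0-based parent index.
for token in callLTP(u"习近平主席视察厦门"):
    print(token)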

# analyse each line, calling callLTP
Esempio n. 32
0
class DSFN:
    """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析
    Attributes:
        default_user_dict_dir:str,用户自定义词典目录
        default_model_dir:str,ltp模型文件目录
    """

    entity_verb_new = entity_verb_new()
    all_entity = entity_verb_new.readAllEntity(
        "../../entity_verb//entity_verb_result\\all_entity.json")
    default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\'  # LTP model file directory
    location_entity = [
        "中和殿", "太庙", "人文地理", "亚运村", "九龙壁", "圆明园", "古典建筑", "庑殿顶", "天井", "无量殿",
        "慈宁宫", "三希堂", "居庸关", "延寿寺", "排云殿", "东桥", "圜丘", "南天门", "垂花门", "西六宫",
        "配楼", "柳荫街", "中国四大名园", "午门", "乾东五所", "建筑管理", "世界博物馆", "西什库教堂", "晚清",
        "万泉河", "东暖阁", "储秀宫", "西华门", "院落", "地安门东大街", "御路", "知鱼桥", "清宁宫", "金水河",
        "景山前街", "司马台长城", "景山公园", "乐寿堂", "东六宫", "延陵", "宜芸馆", "芍药居", "承乾宫",
        "琉璃瓦", "湘江", "敌楼", "安定门外大街", "三音石", "崇文门", "天坛路", "台基", "东城区", "外朝",
        "武备", "全国重点文物保护单位", "房山石", "静园", "香山", "中东", "坤宁宫", "彩画", "江南园林",
        "北河沿大街", "岳阳楼", "丽景轩", "巴黎圣母院", "钟表馆", "戏楼", "白银", "红海", "中原", "明长城",
        "神乐署", "瀛洲", "码头", "百度地图", "旋子彩画", "乾西五所", "天圆地方", "琉璃厂文化街", "广岛",
        "御沟", "井亭", "古柏林", "石坊", "北京故宫", "宝云阁", "甬道", "熙和门", "乾清门", "北京城",
        "暖温带", "沥粉贴金", "安定路", "北齐长城", "减柱造", "宅园", "清华园", "天坛东门站", "西苑", "土山",
        "温带季风气候", "宫古", "东直门", "美国国务卿", "北海", "中华梦石城", "东门站", "天坛公园", "江山",
        "谐趣园", "修宅", "苏堤", "玉泉", "牌坊", "蓟镇", "高速公路", "钟粹宫", "无梁殿", "政治家", "牌楼",
        "波斯", "西内", "老龙头", "阴阳石", "三神山", "丹陛桥", "中国第一历史档案馆", "建筑艺术", "四川",
        "护城河", "文华殿", "静宜园", "乐峰", "永和宫", "金砖", "清漪园", "安定门", "宫殿", "梵华楼",
        "龙井", "水街", "东华门", "歇山式顶", "斋宫", "渤海镇", "仁和", "白浮村", "建筑风格", "买卖街",
        "藻鉴堂", "寿安宫", "奉先殿", "后海", "宋", "承德避暑山庄", "前门站", "寿安山", "八达岭", "棂星门",
        "经幢", "泰山", "后三宫", "天桥商场", "维新派", "拙政园", "北京十六景", "南湖岛", "山寨", "东海",
        "寺庙", "图书馆", "西山", "延禧宫", "九土", "十七孔桥", "鹊桥", "石鼓", "样式雷", "礼乐", "圆石",
        "动物园", "西湖", "齐长城遗址", "京畿", "正脊", "神武门", "洛神赋图", "绿地面积", "暖阁", "多宝塔",
        "磨砖对缝", "湖心亭", "崇楼", "五谷丰登", "养性殿", "关山", "砖雕", "北境", "凤凰墩", "金殿",
        "永定路", "世界遗产", "古柏", "郡王府", "慕田峪", "皇舆全览图", "古典园林", "坐北朝南", "皇极殿",
        "皇家园林", "东四十条", "京西", "黄花镇", "通惠河", "宁寿宫", "旅游局", "大角楼", "昆明湖", "后溪",
        "东堤", "汉白玉石", "皇史宬", "湖心岛", "长春宫", "玉澜堂", "紫檀", "玉泉山", "玉山", "茶楼",
        "敌台", "乾清宫", "巴县", "藕香榭", "斗拱", "苏州街", "紫禁城", "颐和轩", "皇穹宇", "南方",
        "智慧海", "八小部洲", "拱券", "门楣", "太和殿", "銮仪卫", "法门寺地宫", "清音阁", "龙王庙", "城岛",
        "皇陵", "筒瓦", "天地坛", "张古", "建筑史", "武英殿", "北长街", "天坛", "云山", "大石桥", "北平",
        "宫殿建筑", "山东", "博物馆", "昆明池", "交道口南大街", "平流村", "聊城", "三大殿", "清晏舫", "墀头",
        "养心殿", "御道", "百花园", "翊坤宫", "神道", "落地罩", "渔村", "丹陛", "歇山顶", "畅音阁",
        "漱芳斋", "黄鹤楼", "柱础", "嘉乐堂", "庆长", "档案", "保定", "上海", "佛香阁", "望柱", "德和园",
        "天桥", "北京旅游网", "祈年殿", "颐和园", "攒尖顶", "香岩宗印之阁", "分界线", "大杂院", "交泰殿",
        "太和门", "南郊", "健翔桥", "瓮山", "勤政殿", "云南", "景仁宫", "小山村", "金水桥", "保和殿",
        "寄畅园", "珍妃井", "德和园大戏楼", "正房", "第一批全国重点文物保护单位", "三合院", "万寿山", "厉家菜",
        "玉峰塔", "藻井", "恭王府花园", "文昌阁", "景山", "前门东大街", "端门", "代王府", "万寿亭", "景阳宫",
        "东四环", "景明楼", "祈谷坛", "大戏楼", "安佑宫", "石舫", "流杯亭", "行宫", "法华寺", "圜丘坛",
        "正义路", "居庸关长城", "箭扣长城", "石牌坊", "回音壁", "和玺彩画", "二龙戏珠", "北四环", "玉龙",
        "广州", "盛京", "四合院", "曲尺", "谷仓", "永定门", "宝顶", "苏式彩画", "皇宫", "寿康宫"
    ]

    def __init__(self, model_dir=default_model_dir, all_entity=all_entity):
        self.default_model_dir = model_dir
        # load the LTP models
        #
        default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\'  # LTP model file directory
        self.segmentor_user = Segmentor()
        user_dict = "..\\source\\user.txt"
        segmentor_flag_user = self.segmentor_user.load_with_lexicon(
            os.path.join(default_model_dir, 'cws.model'), user_dict)
        self.segmentor = Segmentor()
        segmentor_flag = self.segmentor.load(
            os.path.join(default_model_dir, 'cws.model'))
        # POS tagging model
        self.postagger = Postagger()
        postag_flag = self.postagger.load(
            os.path.join(self.default_model_dir, 'pos.model'))
        # named entity recognition model
        self.recognizer = NamedEntityRecognizer()
        ner_flag = self.recognizer.load(
            os.path.join(self.default_model_dir, 'ner.model'))
        # dependency parsing model
        self.parser = Parser()
        parser_flag = self.parser.load(
            os.path.join(self.default_model_dir, 'parser.model'))

        if segmentor_flag or postag_flag or ner_flag or parser_flag or segmentor_flag_user:  # may be wrong
            print('load model failed')

    def segment(self, sentence, segmentor, entity_postag=dict()):
        words = segmentor.segment(sentence)
        lemmas = []
        for lemma in words:
            lemmas.append(lemma)
        return lemmas

    def getPostag(self):
        return self.postagger

    def postag(self, lemmas):
        """
        Parameters
        ----------
        lemmas : List,分词后的结果
        entity_dict:Set,实体词典,处理具体的一则判决书的结构化文本时产生
        Returns
        -------
        words:WordUnit List,包括分词与词性标注的结果
        """
        words = []
        # POS tagging
        postags = self.postagger.postag(lemmas)
        for i in range(len(lemmas)):
            # store each segmented, POS-tagged token as a WordUnit; IDs start from 1
            word = WordUnit(i + 1, lemmas[i], postags[i])
            words.append(word)
        # self.postagger.release()  # release
        return words

    def get_postag(self, word):
        """获得单个词的词性标注
        Args:
            word:str,单词
        Returns:
            pos_tag:str,该单词的词性标注
        """
        pos_tag = self.postagger.postag([word])
        return pos_tag[0]

    def netag(self, words):
        """
        命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并
        Parameters
            words : WordUnit list,包括分词与词性标注结果
        Returns
            words_netag:WordUnit list,包含分词,词性标注与命名实体识别的结果
        """
        lemmas = []  # the segmented words
        postags = []  # the POS tags
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # named entity recognition
        netags = self.recognizer.recognize(lemmas, postags)
        words_netag = EntityCombine().combine(words, netags)
        return words_netag

    def parse(self, words):
        """
        对分词,词性标注与命名实体识别后的结果进行依存句法分析(命名实体识别可选)
        Args:
            words_netag:WordUnit list,包含分词,词性标注与命名实体识别结果
        Returns
            *:sentenceUnit 句子单元
        """
        lemmas = []  # segmentation results
        postags = []  # POS tagging results
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # dependency parsing
        arcs = self.parser.parse(lemmas, postags)
        for i in range(len(arcs)):
            words[i].head = arcs[i].head
            words[i].dependency = arcs[i].relation
        return SentenceUnit(words)

    def close(self):
        """
        关闭与释放
        """
        # pynlpir.close()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()

    def splitSentence(self, text):
        pattern = r'。|!|?|;|='
        result_list = re.split(pattern, text)
        result_list = list(filter(self.not_empty, result_list))
        #    print(result_list)
        return result_list

    def splitSentenceByComma(self, text):
        pattern = r','
        result_list = re.split(pattern, text)
        result_list = list(filter(self.not_empty, result_list))
        final_list = []
        for sentence in result_list:
            if len(sentence) <= 40:
                final_list.append(sentence)
        return final_list

    def not_empty(self, s):
        return s and "".join(s.split())

    def dsfn1_2_3_4COO(self, sentence, item1, item2, flagCOOATT):
        allTripes = []
        """
        判断两个实体是否属于DSFN1的情况,并输出三元组
        """
        # print(item1.lemma)
        # print(item2.lemma)
        # print(flagCOOATT)

        if flagCOOATT == False:
            location_position_list = getAttWord()
            # print(location_position_list)
            if self.dsfnConstraints3(sentence, item1,
                                     item2) and (item1.dependency == "ATT"):
                AttWord = item1.head_word
                AttWordDict = dict()
                AttWordStr = ""
                while AttWord.ID < item2.ID:
                    AttWordDict[AttWord.ID] = AttWord.lemma
                    # print(AttWord.lemma)
                    # AttWordStr += AttWord.lemma
                    if (AttWord.dependency == "ATT"):
                        AttWord = AttWord.head_word
                    else:
                        break

                if (AttWord.ID == item2.ID):
                    flag = True
                    while flag:
                        len1 = len(AttWordDict)
                        AttList = AttWordDict.keys()
                        for id in range(item1.ID + 1, item2.ID):
                            item = sentence.get_word_by_id(id)
                            if item.head_word != None and item.head_word.ID in AttList and (
                                    item.dependency == "ATT"):
                                AttWordDict[item.ID] = item.lemma
                                # print(item.lemma)
                        if len1 == len(AttWordDict):
                            flag = False
                        else:
                            flag = True
                    AttWordDict = sorted(AttWordDict.items(),
                                         key=lambda item: item[0])
                    AttWordStr = ""
                    for i in AttWordDict:
                        AttWordStr += i[1]
                    # print("三元组:(" + item1.lemma + "," + AttWordStr + "," + item2.lemma + ")")

                    if AttWordStr in location_position_list:
                        allTripes.append(
                            [item1.lemma, AttWordStr, item2.lemma])
                        # print(allTripes)
                        # print("-------------------------")
                    # else:
                    #     for attWord in location_position_list:
                    #         if attWord in AttWordStr:
                    #             allTripes.append([item1.lemma, AttWordStr, item2.lemma])
                    #             print(allTripes)
                    #
                    #             print("-------------------------")
        """
        考虑DSFN2的情况
        """
        if item1.dependency == "SBV" and item1.head_word.postag == "v":
            pred1 = item1.head_word
            predDict = dict()
            predDict[pred1.ID] = pred1.lemma

            if item2.dependency == "VOB" and item2.head_word.postag == "v":
                pred2 = item2.head_word
                predDict[pred2.ID] = pred2.lemma
                if (len(predDict) == 1):
                    PredWordStr = ""
                    for i in predDict:
                        PredWordStr += predDict[i]
                    # print("DSFN2三元组:(" + item1.lemma + "," + PredWordStr + "," + item2.lemma + ")")
                    allTripes.append([item1.lemma, PredWordStr, item2.lemma])
                    """
                    新加,为了考虑“习近平视察和访问上海”的情况
                    """
                if len(predDict) == 2:
                    num = self.get_entity_num_between(pred1, pred2, sentence)
                    flagSBV = True
                    flagVOB = True
                    for word in sentence.words:
                        if word.dependency == "SBV" and word.head_word.ID == pred2.ID:
                            flagSBV = False
                        if (word.dependency == "VOB" and word.head_word.ID == pred1.ID)  or (word.dependency == "POB" \
                                and word.head_word.dependency == "ADV" and word.head_word.head_word.ID == pred1.ID):
                            flagVOB = False
                    flagCMP = True
                    if pred1 != None and pred1.dependency == "CMP" and pred1.head_word.ID == pred2.ID:
                        flagCMP = False
                    if pred2 != None and pred2.dependency == "CMP" and pred2.head_word.ID == pred1.ID:
                        flagCMP = False
                    flagCOO = True
                    if pred1 != None and pred1.dependency == "COO" and pred1.head_word.ID == pred2.ID:
                        flagCOO = False
                    if pred2 != None and pred2.dependency == "COO" and pred2.head_word.ID == pred1.ID:
                        flagCOO = False

                    # print("pred1:"+pred1.lemma+",pred2:"+pred2.lemma+",num:"+str(num))
                    if num == 0:
                        if flagCMP == False:
                            if flagVOB == True and flagSBV == True:
                                allTripes.append([
                                    item1.lemma,
                                    pred1.lemma + "" + pred2.lemma, item2.lemma
                                ])
                        if flagCOO == False:
                            if flagVOB == True and flagSBV == True:
                                allTripes.append([
                                    item1.lemma,
                                    pred1.lemma + "" + pred2.lemma, item2.lemma
                                ])
                        else:
                            if flagVOB == True:
                                allTripes.append(
                                    [item1.lemma, pred1.lemma, item2.lemma])
                            if flagSBV == True:
                                allTripes.append(
                                    [item1.lemma, pred2.lemma, item2.lemma])
        """
        DSFN3.0
        """
        pred = None
        if item1.dependency == "SBV" and item1.head_word.postag == "v" and item2.dependency == "POB":
            pred = item1.head_word
            prep = item2.head_word
        elif item1.dependency == "FOB" and item2.dependency == "POB":  # 考虑介词为“被”的情况,如 “小王被小明所陷害”
            pred = item1.head_word
            prep = item2.head_word
            c = item1
            item1 = item2
            item2 = c
        if pred != None and prep != None:
            if prep.dependency == "ADV":
                if prep.head_word.ID == pred.ID:
                    pred2 = None
                    object = None
                    objectForPred2 = None
                    for i in range(pred.ID + 1, len(sentence.words) + 1):
                        item = sentence.get_word_by_id(i)

                        if item.dependency == "VOB" and item.head_word.ID == pred.ID:
                            object = item
                            objectDict = dict()
                            objectDict[object.ID] = object
                            for word in sentence.words:
                                if word.head_word != None and word.dependency == "ATT" and word.head_word.ID in objectDict:
                                    objectDict[word.ID] = word
                            objectDict = sorted(objectDict.items(),
                                                key=lambda item: item[0])
                            objectStr = ""
                            for objectItem in objectDict:
                                objectStr += objectItem[1].lemma
                            allTripes.append([
                                item1.lemma, pred.lemma + "" + objectStr,
                                item2.lemma
                            ])

                    if object == None:
                        hasPOB = False
                        for i in range(pred.ID + 1, len(sentence.words) + 1):
                            item = sentence.get_word_by_id(i)
                            if item.dependency == "POB" and item.head_word.dependency == "CMP" and item.head_word.head_word.ID == pred.ID:
                                hasPOB = True
                                allTripes.append([
                                    item1.lemma, pred.lemma + "" +
                                    item.head_word.lemma + "" + item.lemma,
                                    item2.lemma
                                ])
                        # print("DSFN3三元组:(" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")")
                        if hasPOB == False:
                            allTripes.append(
                                [item1.lemma, pred.lemma, item2.lemma])
        """
        DSFN4
        """
        pred = None
        prep = None
        prep1 = None
        pred2 = None
        if item1.dependency == "SBV" and item2.dependency == "POB":
            pred = item1.head_word
            prep = item2.head_word
            if prep.dependency == "CMP":
                pred2 = prep.head_word
                if pred2.ID == pred.ID:
                    # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep.lemma + "," + item2.lemma + ")")
                    allTripes.append([
                        item1.lemma, pred.lemma + "" + prep.lemma, item2.lemma
                    ])
                else:
                    num = self.get_entity_num_between(pred, pred2, sentence)
                    flagSBV = True
                    flagVOB = True
                    for word in sentence.words:
                        if word.dependency == "SBV" and word.head_word.ID == pred2.ID:
                            flagSBV = False
                        if (word.dependency == "VOB" and word.head_word.ID == pred.ID) or (word.dependency == "POB" \
                                and word.head_word.dependency == "ADV" and word.head_word.head_word.ID == pred.ID):
                            flagVOB = False
                    # print("pred1:"+pred1.lemma+",pred2:"+pred2.lemma+",num:"+str(num))
                    if num == 0:
                        flag = True
                        for word in sentence.words:
                            if word.dependency == "CMP" and word.head_word.ID == pred.ID:
                                prep1 = word
                        if prep1 != None:
                            if flagVOB == True:
                                # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep1.lemma + "," + item2.lemma + ")")
                                allTripes.append([
                                    item1.lemma, pred.lemma + "" + prep1.lemma,
                                    item2.lemma
                                ])
                            # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")")
                            if flagSBV == True:
                                allTripes.append([
                                    item1.lemma, pred2.lemma + "" + prep.lemma,
                                    item2.lemma
                                ])
                        else:
                            if flagVOB == True:
                                # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")")
                                allTripes.append(
                                    [item1.lemma, pred.lemma, item2.lemma])
                            if flagSBV == True:
                                # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")")
                                allTripes.append([
                                    item1.lemma, pred2.lemma + "" + prep.lemma,
                                    item2.lemma
                                ])
        """
        DSFN5
        """
        # self.dsfn5and6(rawSentence,sentence,item1,item2)
        return allTripes

    def get_entity_num_between(self, verb1, verb2, sentence):
        """
        获得两个动词之间的实体数量
        Parameters
        ----------
        entity1 : WordUnit,动词1
        entity2 : WordUnit,动词2
        Returns:
            num:int,两动词间的实体数量
        """
        if verb1.ID > verb2.ID:
            c = verb1
            verb1 = verb2
            verb2 = c
        num = 0
        i = verb1.ID
        while i < verb2.ID - 1:
            if self.is_entity(sentence.words[i]):
                num += 1
            i += 1
        return num

    def is_entity(self, entry):
        """判断词单元是否是实体
        Args:
            entry:WordUnit,词单元
        Returns:
            *:bool,实体(True),非实体(False)
        """
        #候选实体词性列表
        entity_postags = ['nh', 'ni', 'ns', 'nz', 'j', 'n', 'v', 'm']
        # print(entry.lemma+" : "+entry.postag)
        if entry.postag in entity_postags:
            return True
        else:
            return False

    def dsfn5COO(self, sentence, item1, item2):
        if item1.dependency == "COO":
            item1COO = item1.head_word
            allTripes1 = self.dsfn1_2_3_4COO(sentence, item1COO, item2, True)
            # print(allTripes1)
            for tripe in allTripes1:
                if tripe[0] == item1COO.lemma:
                    tripe[0] = item1.lemma
                elif tripe[2] == item1COO.lemma:
                    tripe[2] = item1.lemma
            return allTripes1
            # print("allTripes1"+str(allTripes1))
    def dsfn6COO(self, sentence, item1, item2):
        if item2.dependency == "COO":
            item2COO = item2.head_word
            allTripes2 = self.dsfn1_2_3_4COO(sentence, item1, item2COO, True)
            for tripe in allTripes2:
                if tripe[2] == item2COO.lemma:
                    tripe[2] = item2.lemma
                elif tripe[0] == item2COO.lemma:
                    tripe[0] = item2.lemma
            return allTripes2

    def dsfn5and6COO(self, sentence, item1, item2):
        if item1.dependency == "COO":
            item1COO = item1.head_word
            if item2.dependency == "COO":
                item2COO = item2.head_word
                allTripe = self.dsfn1_2_3_4COO(sentence, item1COO, item2COO,
                                               True)
                for tripe in allTripe:

                    if tripe[0] == item1COO.lemma and tripe[
                            2] == item2COO.lemma:
                        tripe[0] = item1.lemma
                        tripe[2] = item2.lemma
                    if tripe[2] == item1COO.lemma and tripe[
                            0] == item2COO.lemma:
                        tripe[2] = item1.lemma
                        tripe[0] = item2.lemma
                return allTripe

    def dsfnStart(self, rawSentence, segmentor, entity1, entity2, all_entity):
        nounRelatedWithPosition = ['主席', '总理', '教授', '校长']
        resultList = []
        lemmas = dsfn.segment(rawSentence, segmentor)
        words = dsfn.postag(lemmas)
        words_netag = dsfn.netag(words)
        sentence = dsfn.parse(words_netag)
        # print(sentence.to_string())
        Rawitem1 = None
        Rawitem2 = None
        item1 = None
        item2 = None
        Rawitem1Index = -1
        Rawitem2Index = -1
        indexList = [-1, -1]
        for item in sentence.words:
            # print(str(item.ID) + " " +item.lemma )
            if (item.lemma == entity1):
                Rawitem1 = item
            if (item.lemma == entity2):
                Rawitem2 = item
            if Rawitem1 != None and Rawitem2 != None and (
                    Rawitem1.ID != Rawitem1Index
                    or Rawitem2.ID != Rawitem2Index):
                Rawitem1Index = Rawitem1.ID
                Rawitem2Index = Rawitem2.ID
                # print(str(Rawitem1Index) +" " +str(Rawitem2Index))
                # if item1 == None or item2 == None:
                #     return None
                item1 = Rawitem1
                item2 = Rawitem2
                if item1.ID > item2.ID:
                    c = item1
                    item1 = item2
                    item2 = c
                # print(str(item1.ID) + "   " + str(item2.ID))
                itemCopy1 = item1
                itemCopy2 = item2
                # print(item1.lemma)
                # print(item2.lemma)
                # print(self.dsfnConstraints2(sentence,item1,item2,all_entity))
                if self.dsfnConstraints2(sentence, item1, item2,
                                         all_entity) == False:

                    continue
                allTripes = self.dsfnStartCOO2(sentence, item1, item2, False)
                if allTripes != None:
                    for tripe in allTripes:
                        if tripe[1] != "":
                            resultList.append(tripe)
        if item1 == None or item2 == None:
            return None
        if len(resultList) > 0:
            return resultList

    def dsfnStartCOO2(self, sentence, item1, item2, flagCOOATT):
        nounRelatedWithPosition = ['主席', '总理', '教授', '校长']
        resultList = []
        itemCopy1 = item1
        itemCopy2 = item2
        """
        来解决ATT依赖的名词,如 李克强[ATT] <----- 总理[SBV]
        """
        # print(item1.lemma)
        # print(item2.lemma)
        allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2, flagCOOATT)
        if len(allTripes) == 0:
            # print("11111111")
            allTripes = self.dsfn5COO(sentence, item1, item2)
            if allTripes == None or len(allTripes) == 0:
                # print("2222222")
                allTripes = self.dsfn6COO(sentence, item1, item2)
                if allTripes == None or len(allTripes) == 0:
                    # print("3333333")
                    allTripes = self.dsfn5and6COO(sentence, item1, item2)
                    # if allTripes == None or len(allTripes) == 0:
                    #     print("44444444444")
                    #     allTripes = self.dsfnAttCOO(sentence,item1,item2)
        # print("第一次"+str(allTripes))
        if allTripes != None and len(allTripes) != 0:
            for tripe in allTripes:
                resultList.append(tripe)
        # print("第二次")
        pred1 = None
        subForCoo = None
        for item in sentence.words:
            if item.postag == "v" and item.dependency == "COO":
                pred1 = item.head_word

                for word in sentence.words:
                    if word.dependency == "SBV" and word.head_word.ID == pred1.ID:
                        for phrase in sentence.words:
                            if phrase.dependency == "SBV" and phrase.head_word.ID == item.ID:
                                subForCoo = phrase
                        if subForCoo == None or (
                                subForCoo != None and subForCoo.ID
                                == word.ID):  # handle coordinated (COO) verbs: only when the coordinated verb has no subject of its own,
                            # e.g. in “习近平主席视察厦门,李克强总理访问香港” each verb keeps its own subject
                            word.head_word = item
                            # print(sentence.to_string())
                            # print(item1.lemma)
                            # print(item2.lemma)
                            allTripes = self.dsfn1_2_3_4COO(
                                sentence, item1, item2, flagCOOATT)
                            if len(allTripes) == 0:
                                # print("11111111")
                                allTripes = self.dsfn5COO(
                                    sentence, item1, item2)
                                if allTripes == None or len(allTripes) == 0:
                                    # print("2222222")
                                    allTripes = self.dsfn6COO(
                                        sentence, item1, item2)
                                    if allTripes == None or len(
                                            allTripes) == 0:
                                        # print("3333333")
                                        allTripes = self.dsfn5and6COO(
                                            sentence, item1, item2)
                                        # if allTripes == None or len(allTripes) == 0:
                                        #     allTripes = self.dsfnAttCOO(sentence,item1,item2)
                            # print("第二次"+str(allTripes))
                            if allTripes != None and len(allTripes) != 0:
                                for tripe in allTripes:
                                    resultList.append(tripe)
        # print(np.array(set([tuple(t) for t in resultList])))
        return resultList

    def dsfnConstraints1(self, rawSentence, maxLength):
        """
        :param rawSentence: 原句子
        :param maxLength: 句子的最大长度
        :return: 小于maxLength的长度
        """
        newSentence = []

        if len(rawSentence) <= maxLength:
            newSentence.append(rawSentence)
            return newSentence
        else:
            newSentence = self.splitSentenceByComma(rawSentence)
            return newSentence

    def dsfnConstraints2(self, sentence, item1, item2, allEntities):
        countEntity = 0
        countChar = 0
        for index in range(item1.ID + 1, item2.ID):
            word = sentence.get_word_by_id(index)
            countChar += len(word.lemma)
            if word.lemma in allEntities:
                countEntity += 1
        # print(countEntity)
        # print(countChar)
        if countEntity > 3:
            return False
        elif countChar > 12:
            # print(countChar)
            return False
        else:
            return True

    def dsfnConstraints3(self, sentence, item1, item2):
        countChar = 0
        for index in range(item1.ID + 1, item2.ID):
            word = sentence.get_word_by_id(index)
            countChar += len(word.lemma)
        if countChar > 5:
            return False
        else:
            return True

    def getSPO(self, sentence, segmentor):
        all_result = []
        raw_sentence = []
        RawSentence = sentence
        lemmas = self.segment(sentence, segmentor)
        words = self.postag(lemmas)
        words_netag = self.netag(words)
        sentence = self.parse(words_netag)
        # print(sentence.to_string())
        for itemWord in sentence.words:
            # find a verb that is either the HED of the sentence or linked to the HED by a COO relation
            if (itemWord.head_word == None and itemWord.postag == "v" ) or (itemWord.postag == "v" and
                                                                  itemWord.dependency == "COO" and itemWord.head_word.head_word == None)\
                     or (itemWord.postag == "v") :
                relation_verb = itemWord  # use this verb as relation_verb
                relationString = relation_verb.lemma
                # print(relationString)
                if itemWord.head_word == None:
                    # print("1")
                    verbId = itemWord.ID  # ID of the relation verb
                    verbId2 = None
                elif itemWord.head_word.head_word == None:
                    # print("2")

                    verbId = itemWord.ID  # ID of this relation verb
                    if itemWord.dependency == "COO" or self.get_entity_num_between(
                            itemWord, itemWord.head_word, sentence) == 0:
                        verbId2 = itemWord.head_word.ID  # the HED of the sentence, used to find the subject
                    else:
                        verbId2 = None
                else:
                    # print("3")
                    verbId = itemWord.ID  # ID of this relation verb
                    if itemWord.dependency == "COO" or self.get_entity_num_between(
                            itemWord, itemWord.head_word, sentence) == 0:
                        verbId2 = itemWord.head_word.ID  # the HED of the sentence, used to find the subject
                    else:
                        verbId2 = None
                O_dict = dict()  # stores every Object
                S_dict = dict()  # stores every Subject
                verb_dict = dict()  # stores every verb; mainly for sentences like “习近平主席在北京大学发表演讲”
                OBJ = None
                SUB = None
                DSFN3 = dict()
                for item in sentence.words:
                    if item.dependency == "SBV" and item.head_word.ID == verbId:  #寻找这个动词的主语
                        # if SUB == None or SUB.lemma != entity:
                        SUB = item  #找到主语
                        S_dict[SUB.ID] = SUB.lemma  #将主语加入到字典中

                    if (item.dependency == "VOB"
                            and item.head_word.ID == verbId
                            and item.postag != "v"):
                        # find the verb's object: either a direct object, or a prepositional object (object --POB--> preposition (postag p) --ADV or CMP--> verb)
                        OBJ = item
                        O_dict[OBJ.ID] = OBJ.lemma
                        relationString = relation_verb.lemma
                        verb_dict[OBJ.ID] = relationString
                    if (item.dependency == "POB"
                            and item.head_word.postag == "p"
                            and item.head_word.dependency == "CMP"
                            and item.head_word.head_word.ID == verbId):
                        # find the verb's object: either a direct object, or a prepositional object (object --POB--> preposition (postag p) --ADV or CMP--> verb)
                        OBJ = item
                        O_dict[OBJ.ID] = OBJ.lemma
                        relationString = relation_verb.lemma + "" + item.head_word.lemma
                        verb_dict[OBJ.ID] = relationString

                    if (item.dependency == "POB" and (item.head_word.postag == "p" or item.head_word.postag == 'd')\
                            and item.head_word.dependency == "ADV" and item.head_word.head_word.ID == verbId \
                            and item.postag!='v'):
                        # find the verb's object: either a direct object, or a prepositional object (object --POB--> preposition (postag p) --ADV or CMP--> verb)
                        OBJ = item
                        O_dict[OBJ.ID] = OBJ.lemma
                        verbObj = None
                        DSFN3[OBJ.ID] = True
                        objectDict = dict()
                        relationString = relation_verb.lemma
                        for eachWord in sentence.words:
                            if eachWord.dependency == "VOB" and eachWord.head_word.ID == relation_verb.ID:
                                # relationString = relation_verb.lemma + "" + eachWord.lemma
                                verbObj = eachWord
                                objectDict[verbObj.ID] = verbObj
                        if verbObj != None:
                            for word in sentence.words:
                                if word.head_word != None and word.dependency == "ATT" and word.head_word.ID == verbObj.ID:
                                    objectDict[word.ID] = word
                            objectDict = sorted(objectDict.items(),
                                                key=lambda item: item[0])
                            objectStr = ""
                            for objectItem in objectDict:
                                objectStr += objectItem[1].lemma
                            relationString = relation_verb.lemma + "" + objectStr

                        else:
                            for eachWord in sentence.words:
                                if eachWord.dependency == "POB" and eachWord.head_word.dependency == "CMP" and\
                                    eachWord.head_word.head_word.ID == relation_verb.ID:
                                    relationString = relation_verb.lemma + "" + eachWord.head_word.lemma + "" + eachWord.lemma

                        verb_dict[OBJ.ID] = relationString

                if SUB == None:  # if no subject was found, fall back to the subject of verbId2, the verb coordinated with this one
                    for item in sentence.words:
                        if item.dependency == "SBV" and item.head_word.ID == verbId2:
                            # if SUB == None or SUB.lemma != entity:
                            SUB = item
                            S_dict[SUB.ID] = SUB.lemma
                # print(verbId2)
                if OBJ == None:
                    verb_coo = None
                    for item in sentence.words:
                        if item.dependency == "COO" and item.head_word.ID == verbId and item.ID > verbId:
                            verb_coo = item
                            break
                    flag = True
                    if verb_coo != None and self.get_entity_num_between(
                            relation_verb, verb_coo, sentence) == 0:

                        for item in sentence.words:
                            if item.dependency == "SBV" and item.head_word.ID == verb_coo.ID:
                                flag = False
                        if flag != False:
                            for item in sentence.words:
                                if (item.dependency == "VOB" and item.head_word.ID == verb_coo.ID)\
                                        or (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP"
                                and item.head_word.head_word.ID== verb_coo.ID) or (item.dependency == "POB" and item.head_word.postag == "p"\
                        and item.head_word.dependency == "ADV" and item.head_word.head_word.ID== verb_coo.ID):

                                    OBJ = item
                                    O_dict[OBJ.ID] = OBJ.lemma
                # print(S_dict)
                # print(verb_dict)
                # print(O_dict)
                SUB_COO = None
                OBJ_COO = None
                for item in sentence.words:
                    if item.head_word != None:
                        if SUB != None and item.dependency == "COO" and item.head_word.ID in S_dict:  # collect the subject's COO words
                            SUB_COO = item
                            S_dict[SUB_COO.ID] = SUB_COO.lemma
                    if item.head_word != None and OBJ != None:
                        if item.dependency == "COO" and item.head_word.ID in O_dict:  #获得宾语的COO
                            OBJ_COO = item
                            O_dict[OBJ_COO.ID] = OBJ_COO.lemma
                S_new = []

                for sub in S_dict:
                    S_new.append(S_dict[sub])

                O_new = []
                V_new = []
                for obj in O_dict:
                    if verb_dict != None:
                        if obj in verb_dict:
                            relationString2 = verb_dict[obj]
                        else:
                            relationString2 = relation_verb.lemma
                    else:
                        relationString2 = relation_verb.lemma
                    V_new.append(relationString2)
                    O_new.append(O_dict[obj])

                for sub in S_new:
                    for i in range(0, len(O_new)):
                        obj = O_new[i]
                        relationWord = V_new[i]
                        if obj != "":
                            all_result.append([sub, relationWord, obj])
                            raw_sentence.append(RawSentence)

        return all_result, raw_sentence

    def hasEntity(self, word, allEntity):
        for entity in allEntity:
            if entity in word:
                # print(entity)
                return True
        return False

    def PostProcessSPO(self, rawSentence, allTripes, allEntity):
        output_list = []
        for i in range(0, len(allTripes)):
            tripe = allTripes[i]
            sub = tripe[0]
            obj = tripe[2]
            print(sub)
            print(obj)
            if self.hasEntity(sub, allEntity) and self.hasEntity(
                    obj, allEntity):
                # print(sub)
                # print(obj)
                output_list.append(tripe)
        return output_list
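
# An end-to-end usage sketch for the DSFN class above, assuming its helper
# classes (WordUnit, SentenceUnit, EntityCombine, entity_verb_new), the entity
# JSON file and the LTP models referenced in __init__ are all available; the
# sample text and entity list are made up for illustration.
dsfn = DSFN()
entities = [u"习近平", u"厦门"]
for clause in dsfn.splitSentence(u"习近平主席视察厦门。他发表了讲话。"):
    tripes, raw = dsfn.getSPO(clause, dsfn.segmentor)
    print(dsfn.PostProcessSPO(raw, tripes, entities))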
Esempio n. 33
0
sentence = SentenceSplitter.split(paragraph)[0]

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
words = segmentor.segment(sentence)
print "\t".join(words)

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# list-of-string parameter is supported since 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print "\t".join(postags)

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print "\t".join(netags)

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
roles = labeller.label(words, postags, netags, arcs)

for role in roles:
LTP_DATA_DIR = './ltp_data_v3.4.0'  # path to the LTP model directory
par_model_path = os.path.join(LTP_DATA_DIR,
                              'parser.model')  # dependency parsing model, file name `parser.model`
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # word segmentation
pos_model_path = os.path.join(LTP_DATA_DIR,
                              'pos.model')  # POS tagging model, file name `pos.model`
# srl_model_path = os.path.join(LTP_DATA_DIR, 'pisrl.model')  # semantic role labelling model directory

segmentor = Segmentor()  # initialise the instance
segmentor.load(cws_model_path)  # load the model

postagger = Postagger()  # initialise the instance
postagger.load(pos_model_path)  # load the model

parser = Parser()  # initialise the instance
parser.load(par_model_path)  # load the model

# labeller = SementicRoleLabeller() # initialise the instance
# labeller.load(srl_model_path)  # load the model
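
# A quick sanity check of the loaded pipeline (a sketch; the sample sentence is
# taken from the pyltp examples earlier on this page).
words = segmentor.segment('中国进出口银行与中国银行加强合作')
postags = postagger.postag(words)
arcs = parser.parse(words, postags)
print('\t'.join('%s/%s' % (w, p) for w, p in zip(words, postags)))
print('\t'.join('%d:%s' % (arc.head, arc.relation) for arc in arcs))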


def cht_to_chs(line):  # convert traditional Chinese to simplified Chinese
    line = Converter('zh-hans').convert(line)
    line.encode('utf-8')
    return line
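
# Example call, a sketch: converts a traditional-Chinese string to simplified,
# e.g. '漢語' should come back as '汉语'.
print(cht_to_chs(u'漢語'))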


def data_prepare(sentences, labA, labT, labD):  # load the manual annotations
    wordList = []
    labelList = []