Example #1
    def role_label(self, words, postags, arcs):
        """
        Semantic role labeling.
        :param words: segmented words
        :param postags: part-of-speech tags
        :param arcs: dependency parsing result
        :return: string representation of the labeled roles
        """
        srl_model = os.path.join(self.MODEL_PATH, 'pisrl_win.model')

        labeller = SementicRoleLabeller()  # initialize the labeller
        labeller.load(srl_model)  # load the model

        roles = labeller.label(words, postags, arcs)  # semantic role labeling

        for role in roles:
            print(
                role.index, "".join([
                    "{0}:({1},{2})".format(arg.name, arg.range.start,
                                           arg.range.end)
                    for arg in role.arguments
                ]))
        labeller.release()

        return "roles{}".format(roles)
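Example #1 expects words, postags, and arcs from the upstream LTP pipeline. A minimal sketch of that pipeline, assuming the usual ltp_data_v3.4.0 model directory (the path and sample sentence are illustrative):

import os
from pyltp import Segmentor, Postagger, Parser

MODEL_PATH = './ltp_data_v3.4.0'  # assumed model directory

segmentor = Segmentor()
segmentor.load(os.path.join(MODEL_PATH, 'cws.model'))
words = list(segmentor.segment('元芳你怎么看'))

postagger = Postagger()
postagger.load(os.path.join(MODEL_PATH, 'pos.model'))
postags = list(postagger.postag(words))

parser = Parser()
parser.load(os.path.join(MODEL_PATH, 'parser.model'))
arcs = parser.parse(words, postags)  # feeds directly into role_label

segmentor.release()
postagger.release()
parser.release()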
Example #2
    def semantic_role_label(self):
        # dependency parsing
        parser = Parser()
        parser.load('ltp_data/parser.model')
        arcs = parser.parse(self.words, self.postags)
        parser.release()

        labeller = SementicRoleLabeller()
        labeller.load('ltp_data/srl')
        roles = labeller.label(self.words, self.postags, self.netags, arcs)

        Label_AX = []  # arguments labeled A0 or A1
        for role in roles:
            Label_AX.extend([
                arg for arg in role.arguments
                if arg.name in ("A0", "A1")
            ])
        for label in Label_AX:
            # skip agents/patients whose A0/A1 span has an abnormal length
            if 0 < label.range.end - label.range.start < 10:
                for i in range(label.range.start, label.range.end + 1):
                    # keep common nouns, person, place and organization names
                    # from the agent/patient span as entities
                    if self.postags[i] in ("n", "ns", "nh", "ni"):
                        self.entity.append(self.words[i])
        labeller.release()
Example #3
class LTP_word():
    """LTP word-level parser.
    deal() processes text and returns five values: word list, POS tags,
    dependency arcs, semantic roles, and named-entity tags.
    release() frees the loaded models."""
    def __init__(self, model_path):
        self.model_path = model_path
        self.segmentor = Segmentor()  # word segmenter
        self.segmentor.load_with_lexicon(path.join(self.model_path, 'cws.model'), path.join(self.model_path, 'dictionary_kfc.txt'))
        self.postagger = Postagger()  # POS tagger
        self.postagger.load(path.join(self.model_path, 'pos.model'))  # load the model
        self.recognizer = NamedEntityRecognizer()  # named entity recognizer
        self.recognizer.load(path.join(self.model_path, 'ner.model'))
        self.parser = Parser()  # dependency parser
        self.parser.load(path.join(self.model_path, 'parser.model'))  # load the model
        self.labeller = SementicRoleLabeller()  # semantic role labeller
        self.labeller.load(path.join(self.model_path, 'srl'))
    def deal(self, text):  # extract everything needed in one pass
        words = self.segmentor.segment(text)  # word segmentation
        postags = self.postagger.postag(words)  # POS tagging
        netags = self.recognizer.recognize(words, postags)  # named entities
        arcs = self.parser.parse(words, postags)  # dependency parsing
        roles = self.labeller.label(words, postags, netags, arcs)  # semantic role labeling
        return words, postags, arcs, roles, netags
    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
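A usage sketch for LTP_word; it assumes the model directory also contains the dictionary_kfc.txt lexicon loaded above (the path and sentence are illustrative):

ltp = LTP_word('./ltp_data_v3.4.0')
words, postags, arcs, roles, netags = ltp.deal('李克强总理今天来我家了')
for role in roles:
    print(role.index, [(arg.name, arg.range.start, arg.range.end)
                       for arg in role.arguments])
ltp.release()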
Example #4
def role(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the labeller
    # labeller.load('/usr/local/src/ltp_data/srl')  # load the model
    labeller.load(srl_model_path)  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
    """
    # arg.name is the semantic role type
    # arg.range.start is the index of the argument's first word
    # arg.range.end is the index of the argument's last word
    roletype = {'C-A0': 'agent', 'A0': 'agent', 'A1': 'patient', 'A2': 'indirect object', 'A3': 'direct goal',
                'A4': 'direct method', 'A5': 'other', 'ADV': 'adverbial', 'BNE': 'beneficiary',
                'CND': 'condition', 'DIR': 'direction', 'DGR': 'degree', 'EXT': 'extent', 'FRQ': 'frequency',
                'LOC': 'location', 'MNR': 'manner', 'PRP': 'purpose or reason', 'TMP': 'time', 'TPC': 'topic',
                'CRD': 'coordination', 'PRD': 'predicate', 'PSR': 'possessor', 'PSE': 'possessee', 'DIS': 'discourse transition'}
    postype = {'A0': 'agent', 'A1': 'patient', 'A2': 'indirect object', 'A3': 'direct goal',
               'A4': 'direct method', 'A5': 'other', 'ADV': 'adverbial', 'BNE': 'beneficiary',
               'CND': 'condition', 'DIR': 'direction', 'DGR': 'degree', 'EXT': 'extent', 'FRQ': 'frequency',
               'LOC': 'location', 'MNR': 'manner', 'PRP': 'purpose or reason', 'TMP': 'time', 'TPC': 'topic',
               'CRD': 'coordination', 'PRD': 'predicate', 'PSR': 'possessor', 'PSE': 'possessee'}
    for role in roles:
        # print(role.index, "".join(["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))

        outstr = ""
        for arg in role.arguments:
            block = ''

            for num in range(arg.range.start, arg.range.end + 1):
                block = block + words[num] + '[%d-%s]' % (num, postags[num])
            outstr = outstr + roletype[arg.name] + "(%s);" % block
        print('%d-%s' % (role.index, words[role.index]) + ":" + outstr)
    """
    labeller.release()  # release the model
    return roles
Example #5
class LtpParser:
    def __init__(self):
        LTP_DIR = "./ltp_data_v3.4.0"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))

    '''Semantic role labeling'''
    def format_labelrole(self, words, postags):
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {arg.name:[arg.name,arg.range.start, arg.range.end] for arg in role.arguments}
        return roles_dict

    '''Dependency parsing: maintain, for every word in the sentence, a dict of its dependency children'''
    def build_parse_child_dict(self, words, postags, arcs):
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:   # arc heads are 1-based
                    child_dict.setdefault(arcs[arc_index].relation, []).append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]  # each word's dependency parent id
        relation = [arc.relation for arc in arcs]  # each word's dependency relation
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]  # resolve parent words ('Root' for id 0)
        for i in range(len(words)):
            # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [relation[i], words[i], i, postags[i], heads[i], rely_id[i]-1, postags[rely_id[i]-1]]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''Main parser entry point'''
    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list
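A usage sketch for this LtpParser, assuming ./ltp_data_v3.4.0 holds the models as loaded above (the sentence is illustrative):

parser = LtpParser()
words, postags, child_dict_list, roles_dict, format_parse_list = \
    parser.parser_main('李克强总理今天来我家了')
print(words)              # segmented words
print(roles_dict)         # {predicate_index: {role_name: [name, start, end]}}
print(format_parse_list)  # per word: [relation, word, idx, pos, head, head_idx, head_pos]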
Example #6
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the labeller
    labeller.load('/Users/chenming/Spyder/3.3.1/ltp_data/srl/')  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
    for role in roles:
        print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    labeller.release()  # release the model
Example #7
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the labeller
    labeller.load(srl_model_path)  # load the model
    roles = labeller.label(words, postags, arcs)  # semantic role labeling (netags unused with this model)
    # for role in roles:
    #     print(role.index, "".join(["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    labeller.release()  # release the model
    return roles
Example #8
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the labeller
    model = "srl"
    labeller.load(os.path.join(modelPath, model))  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
    for role in roles:
        print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    labeller.release()  # release the model
Example #9
def yuyijuese(words, postags, netags, arcs):
    """Semantic role labeling."""
    labeller = SementicRoleLabeller()
    labeller.load(os.path.join(MODELDIR, "srl/"))
    roles = labeller.label(words, postags, netags, arcs)

    for role in roles:
        print(role.index, "".join(
                ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
Example #10
def ltp_sementic_role_labeller(LTP_DATA_DIR, words, postags, arcs):
    # Path to the semantic role labeling model. In older LTP releases the model
    # was the `srl` directory; on Windows, LTP 3.4.0 uses the pisrl_win.model file.
    srl_model_path = os.path.join(LTP_DATA_DIR, 'srl/pisrl_win.model')
    labeller = SementicRoleLabeller()  # initialize the labeller
    labeller.load(srl_model_path)  # load the model
    # arcs is the dependency parsing result
    roles = labeller.label(words, postags, arcs)  # semantic role labeling
    labeller.release()  # release the model
    return roles
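A sketch of consuming the returned roles; the attribute names follow pyltp's role objects as printed throughout these examples:

roles = ltp_sementic_role_labeller(LTP_DATA_DIR, words, postags, arcs)
for role in roles:
    # role.index is the predicate's word index; each argument spans
    # words[arg.range.start : arg.range.end + 1]
    for arg in role.arguments:
        print(role.index, arg.name, arg.range.start, arg.range.end)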
Example #11
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the labeller
    labeller.load('/Users/zhangqinyuan/Downloads/ltp_data_v3.4.0/srl')  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
    for role in roles:
        print(
            role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
    labeller.release()  # release the model
Example #12
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the labeller
    labeller.load('../ltp_data/srl')  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
    # print('----------------')
    # for role in roles:
    #     print(role.index, "".join(
    #         ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    # print('----------------')
    labeller.release()  # release the model
    return roles
Example #13
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the labeller
    labeller.load(os.path.join(LTP_DATA_DIR, 'srl'))  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
    for role in roles:
        print(
            role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
    labeller.release()  # release the model
Example #14
def SrlFunction(contents):
    from pyltp import Segmentor
    segmentor = Segmentor()  # initialize the segmenter
    # segmentor.load(cws_model_path)  # load the model
    segmentor.load_with_lexicon(cws_model_path,
                                'E:\\ltp_data_v3.4.0\\personal_seg.txt')
    words = segmentor.segment(contents)  # word segmentation
    k = 1
    for word in words:
        print(word + str(k) + '  ', end='')
        k = k + 1
    print('\n')
    # print('\t'.join(words))
    segmentor.release()  # release the model
    wordslist = list(words)

    from pyltp import Postagger
    postagger = Postagger()
    # postagger.load(pos_model_path)
    postagger.load_with_lexicon(pos_model_path,
                                'D:\\ltp_data_v3.4.0\\personal_pos.txt')
    postags = postagger.postag(wordslist)
    print('\t'.join(postags))
    postagger.release()

    # wordslist = ['人力资源社会保障局','主管','医疗保险','工作']
    # postags = ['n','v','n','v']

    from pyltp import Parser
    parser = Parser()  # initialize the parser
    parser.load(par_model_path)  # load the model
    arcs = parser.parse(wordslist, postags)  # dependency parsing
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # release the model

    from pyltp import SementicRoleLabeller
    labeller = SementicRoleLabeller()  # initialize the labeller
    labeller.load(srl_model_path)  # load the model
    # arcs is the dependency parsing result
    roles = labeller.label(wordslist, postags, arcs)  # semantic role labeling

    # print the results
    for role in roles:
        print(
            role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
    labeller.release()  # release the model
Example #15
    def get_roles_by_pyltp(self, words_list, postags_list, arcs_list):
        roles_list = list()
        # path to the semantic role labeling model; the file is named 'pisrl.model'
        srl_model_path = os.path.join(self.ltp_dir_path, "pisrl.model")
        labeller = SementicRoleLabeller()
        labeller.load(srl_model_path)
        roles = labeller.label(words_list, postags_list, arcs_list)
        labeller.release()
        # an attempt to free memory explicitly:
        # import gc
        # del labeller
        # gc.collect()
        # ...never mind, that didn't work
        roles_list = list(roles)
        return roles_list
Example #16
    def get_role_list(self, words, postags):
        parser = Parser()
        parser.load(Dependency.par_model)

        rolelabel = SementicRoleLabeller()
        rolelabel.load(Dependency.pisrl_model)
        try:
            parsers = parser.parse(words, postags)
            roles = rolelabel.label(words, postags, parsers)
        except Exception as e:
            roles = [[]]
        finally:
            parser.release()
            rolelabel.release()
            return roles
Example #17
def srl(words, postags, arcs):
    global labeller
    if labeller is None:
        srl_model_path = os.path.join(LTP_DATA_DIR, 'srl')  # SRL model path; in this release the model is the `srl` directory, not a single file
        labeller = SementicRoleLabeller()  # initialize the labeller
        labeller.load(srl_model_path)  # load the model

    # arcs is the dependency parsing result
    roles = labeller.label(words, postags, arcs)  # semantic role labeling

    # collect the results
    role_list = []
    for role in roles:
        for arg in role.arguments:
            args = (role.index, arg.name, arg.range.start, arg.range.end)
            role_list.append(args)
    return role_list
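The global above lazy-loads the model once per process. A sketch of the module-level setup this function assumes (the directory value is illustrative):

import os
from pyltp import SementicRoleLabeller

LTP_DATA_DIR = './ltp_data'  # assumed model directory
labeller = None              # populated on the first call to srl()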
Example #18
def get_srl(sentence):
    labeller = SementicRoleLabeller()  # initialize the labeller
    labeller.load(srl_model_path)  # load the model
    words = list(pyltp_cut(sentence))  # pyltp word segmentation
    postags = list(postagger.postag(words))  # POS tagging
    arcs = get_parsing(sentence)
    # arcs is the dependency parsing result
    roles = labeller.label(words, postags, arcs)  # semantic role labeling

    # print the results
    for role in roles:
        print(
            role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
    labeller.release()  # release the model once, after the loop
Example #19
    def get_srl(self, words):
        # semantic role labeling
        labeller = SementicRoleLabeller()  # initialize the labeller
        labeller.load(self.srl_model_path)  # load the model
        # arcs is the dependency parsing result
        postags = self.get_postags(words)
        arcs = self.get_dependency(words)
        roles = labeller.label(words, postags, arcs)  # semantic role labeling

        # print the results
        for role in roles:
            print(
                role.index, "".join([
                    "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                    for arg in role.arguments
                ]))
        labeller.release()  # release the model
        return roles
Example #20
def sentence_label(parse_result):
    labeller = SementicRoleLabeller()  # initialize the labeller
    labeller.load(srl_model_path)  # load the model
    i = 0
    final_result = []

    for key, value in parse_result.items():
        i += 1
        if i % 50 == 0:
            print('taking a short break')
            time.sleep(5)
        words = value[0]
        postags = value[1]
        arcs = value[2]
        roles = labeller.label(words, postags, arcs)
        final_result.append((key, roles))  # collect each sentence's roles

    print('done')
    print(final_result)
    labeller.release()
Example #21
def labeller(word_tag, arcs, srl_model_path):
    '''
    Desc: semantic role labeling
    Args: word_tag (dict)   word -> POS-tag mapping (note: duplicate words collapse)
          arcs              dependency arcs
          srl_model_path    path to the SRL model
    '''

    labeller = SementicRoleLabeller()
    labeller.load(srl_model_path)
    roles = labeller.label(list(word_tag.keys()), list(word_tag.values()),
                           arcs)
    for role in roles:
        print(
            role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
    labeller.release()
Example #22
class LtpParser(object):
    def __init__(self, data_dir: str):
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(data_dir, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(data_dir, "pos.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(data_dir, "ner.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(data_dir, "parser.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(data_dir, "pisrl.model"))

    def parse(self, text: str) -> dict:
        tokens = self.segmentor.segment(text)
        postags = self.postagger.postag(tokens)
        netags = self.recognizer.recognize(tokens, postags)
        arcs = self.parser.parse(tokens, postags)
        roles = self.labeller.label(tokens, postags, arcs)
        srlabels = {}
        for role in roles:
            srlabels[role.index] = {
                arg.name: {
                    "start": arg.range.start,
                    "end": arg.range.end
                }
                for arg in role.arguments
            }
        return {
            "tokens": list(tokens),
            "postags": list(postags),
            "netags": list(netags),
            "srlabels": srlabels,
        }

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
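A usage sketch for Example #22's class (the data_dir value and sentence are illustrative):

parser = LtpParser('./ltp_data_v3.4.0')
try:
    result = parser.parse('元芳你怎么看')
    print(result['tokens'])
    print(result['srlabels'])  # {predicate index: {role: {'start': ..., 'end': ...}}}
finally:
    parser.release()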
Example #23
print("\t".join(words))

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# passing a list of strings is supported since pyltp 0.1.5
#postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print("\t".join(postags))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print("\t".join(netags))

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
#labeller.load("/home/yjliu/ltp/model/srl/")
roles = labeller.label(words, postags, netags, arcs)

for role in roles:
    print(role.index, "".join([
        "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
        for arg in role.arguments
    ]))
Example #24
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# passing a list of strings is supported since pyltp 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print("\t".join(postags))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print("\t".join(netags))

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
roles = labeller.label(words, postags, netags, arcs)

for role in roles:
    print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))

segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()
Example #25
class Extractor():

    def __init__(self):
        self.__clause_list = []
        self.__subclause_dict = {}
        self.__triple_list = []
        self.__segmentor = Segmentor()
        self.__postagger = Postagger()
        self.__recognizer = NamedEntityRecognizer()
        self.__parser = Parser()
        self.__labeller = SementicRoleLabeller()
        self.__words_full_list = []
        self.__netags_full_list = []

    @property
    def clause_list(self):
        return self.__clause_list

    @property
    def triple_list(self):
        return self.__triple_list


    def split(self, words, postags):
        start = 0
        for j, w in enumerate(words):
            if w in (',', ',', '。'):  # fullwidth comma, ASCII comma, full stop
                clause = Clause(start, j - 1)
                self.__clause_list.append(clause)
                start = j + 1

        for clause in self.__clause_list:
            clause.split(postags)
            for subclause in clause.sub_clause_list:
                self.add_inverted_idx(subclause)

    def add_inverted_idx(self, subclause):
        for i in range(subclause.start_idx, subclause.end_idx):
            self.__subclause_dict[i] = subclause

    def load(self):
        self.__segmentor.load('ltp_data/cws.model')
        self.__postagger.load('ltp_data/pos.model')
        self.__recognizer.load('ltp_data/ner.model')
        self.__parser.load('ltp_data/parser.model')
        self.__labeller.load('ltp_data/srl')

    def release(self):
        self.__segmentor.release()
        self.__postagger.release()
        self.__recognizer.release()
        self.__parser.release()
        self.__labeller.release()

    def clear(self):
        self.__triple_list = []
        self.__words_full_list = []
        self.__netags_full_list = []
    
    def resolve_conference(self, entity):
        # resolve simple pronoun coreference: map '他'/'她' to the nearest
        # preceding person-name entity
        try:
            e_str = entity.get_content_as_str()
        except Exception:
            return '?'
        ref = e_str
        if e_str == '他' or e_str == '她':
            for i in range(entity.loc, -1, -1):
                if self.__netags_full_list[i].lower().endswith('nh'):
                    ref = self.__words_full_list[i]
                    break
        return ref
    
    def resolve_all_conference(self):
        for t in self.triple_list:
            e_str = self.resolve_conference(t.entity_1)
            try:
                t.entity_1.content = e_str.split()
            except Exception:
                pass

    def chunk_str(self, data):
        sents = SentenceSplitter.split(data)
        offset = 0
        for sent in sents:
            try:
                words = self.__segmentor.segment(sent)
                postags = self.__postagger.postag(words)
                netags = self.__recognizer.recognize(words, postags)
                arcs = self.__parser.parse(words, postags)
                roles = self.__labeller.label(words, postags, netags, arcs)
                self.chunk_sent(list(words), list(postags), list(arcs), offset)
                offset += len(list(words))
                self.__words_full_list.extend(list(words))
                self.__netags_full_list.extend(list(netags))
            except Exception as e:
                print(str(e))
                pass

    def chunk_sent(self, words, postags, arcs, offset):
        root = [i for i,x in enumerate(arcs) if x.relation == 'HED']
        if len(root) > 1:
            raise Exception('More than 1 HEAD arc is detected!')
        root = root[0]
        relations = [i for i, x in enumerate(arcs) if x.head == root and x.relation == 'COO']
        relations.insert(0, root)

        prev_e1 = None
        e1      = None
        for rel in relations:

            left_arc = [i for i, x in enumerate(arcs) if x.head == rel and x.relation == 'SBV']

            if len(left_arc) > 1:
                pass
                #raise Exception('More than 1 left arc is detected!')
            elif len(left_arc) == 0:
                e1 = prev_e1
            elif len(left_arc) == 1:
                left_arc = left_arc[0]
                leftmost = find_farthest_att(arcs, left_arc)
                e1 = Entity(1, [words[i] for i in range(leftmost, left_arc + 1)], offset + leftmost)


            prev_e1 = e1

            right_arc = [i for i, x in enumerate(arcs) if x.head == rel and x.relation == 'VOB']

            e2_list = []
            if not right_arc:
                e2 = Entity(2, None)
                e2_list.append(e2)
            else:
                right_ext = find_farthest_vob(arcs, right_arc[0])

                items = [i for i, x in enumerate(arcs) if x.head == right_ext and x.relation == 'COO']
                items = right_arc + items

                count = 0
                for item in items:
                    leftmost = find_farthest_att(arcs, item)


                    e2 = None

                    if count == 0:
                        e2 = Entity(2, [words[i] for i in range(leftmost, right_ext + 1)], offset+leftmost)
                    else:
                        p1 = range(leftmost, right_arc[0])
                        p2 = range(item, find_farthest_vob(arcs, item) + 1)
                        e2 = Entity(2, [words[i] for i in itertools.chain(p1, p2)])

                    e2_list.append(e2)
                    r = Relation(words[rel])
                    t = Triple(e1, e2, r)
                    self.__triple_list.append(t)
                    count += 1
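A usage sketch for the Extractor; Clause, Entity, Triple, Relation and the find_farthest_att/find_farthest_vob helpers come from the original project and are assumed importable, and the ltp_data paths must exist as in load():

extractor = Extractor()
extractor.load()
extractor.chunk_str('李克强总理今天来我家了。')
for triple in extractor.triple_list:
    print(triple)
extractor.release()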
Example #26
#     # print('look here!!!!:', roles)
#     for role in roles:
#         print(role.index, "".join(
#             ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
#     labeller.release()  # release the model
#
# words=['天安门','国旗','张思思','优秀']
# postags=posttagger(['天安门','国旗','张思思','优秀'])
# netags=ner(words, postags)
# arcs=parse(words, postags)
# print("---*---"*10)
# role_label(words, postags, netags, arcs)
#-------------------------------------------------
# no srl model matching this LTP version was found
labeller = SementicRoleLabeller()  # initialize the labeller
labeller.load(r'D:\Corpus\ltp-models_full\3.2.0\submodels\srl\srl')  # load the model
words = ['元芳', '你', '怎么', '看']
postags = ['nh', 'r', 'r', 'v']
# arcs is the dependency parsing result
arcs = parse(words, postags)
roles = labeller.label(words, postags, arcs)  # semantic role labeling
print('roles====', roles)
# print the results
for role in roles:
    print(
        role.index, "".join([
            "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
            for arg in role.arguments
        ]))
labeller.release()  # release the model
Example #27
class SentenceParser:
    def __init__(self):
        # LTP_DIR = './ltp_data_v3.4.0'
        print("loading models from", LTP_DIR)
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))
        print("models loaded")

    '''Dependency parsing: maintain, for every word, a dict of its dependency children (the word's out-edges)'''
    '''
        In a dependency parse every word has (presumably) exactly one in-edge
        but may have several out-edges. To present the analysis in a structured
        way and make information extraction easier, build a child dict for
        every word:
            1) if the word's out-degree is 0, the dict is empty
            2) if the word's out-degree is n, the dict holds n entries
    '''

    def build_parse_child_dict(self, words, postags, arcs):
        """
        Format the dependency parsing result.
        :param words: segmentation result
        :param postags: POS tagging result
        :param arcs: dependency parsing result
        :return: child_dict_list, format_parse_list
        """
        '''
        arcs is a list with one element per word; each element arc carries
        arc.head and arc.relation:
            head is the (1-based) index of the word's parent node,
            relation is the dependency relation between parent and word.
            *** Since every word has exactly one in-edge, the arc is exactly
            that in-edge.

        So the LTP parser output arcs describes each word's single parent.
        Returns:
            child_dict_list: each word's out-edges, i.e. its child-node info
            format_parse_list: a per-word record: [relation to parent, word,
                word index, word POS, parent word, parent index, parent POS]
        '''

        child_dict_list = []
        format_parse_list = []

        # build the child info for every word
        for index in range(len(words)):
            child_dict = dict()
            ## scan all arcs for children of this word
            for arc_index in range(len(arcs)):
                ## if an arc points at this word, record the child
                if arcs[arc_index].head == index + 1:
                    child_dict.setdefault(arcs[arc_index].relation, []).append(arc_index)

            child_dict_list.append(child_dict)

        # build the per-word record
        ## fields: [relation, word, index, POS, parent word, parent index, parent POS]  # NER tags could be added as well
        rely_id = [arc.head for arc in arcs]  # each word's parent id (0 means Root)
        relation = [arc.relation for arc in arcs]  # each word's dependency relation
        heads = ['Root' if id == 0 else words[id - 1]
                 for id in rely_id]  # resolve parent words
        for i in range(len(words)):
            # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [
                relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                postags[rely_id[i] - 1]
            ]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''Semantic role labeling'''
    '''
        Only the predicates of a sentence are analyzed: their arguments are
        extracted and each argument's relation to the predicate is labeled.
    '''

    def format_labelrole(self, words, postags):
        """
        Format the semantic role labeling result.
        :param words: segmented words
        :param postags: POS tags
        :return: roles_dict
        """
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        '''
        roles contains one role entry per predicate in the sentence:
            role.index is the predicate's word index,
            role.arguments are the semantic roles attached to that predicate
            (an argument may span more than one word):
                arg.name is the semantic role type,
                arg.range.start is the index of the argument's first word (0-based),
                arg.range.end is the index of the argument's last word.
        Schematically:
        roles = {
            'r1': {
                'args1': {'name': role type, 'range': {'start': first-word index, 'end': last-word index}},
                'args2': {'name': role type, 'range': {'start': first-word index, 'end': last-word index}},
                ...
            },
            'r2': { ... },
            ...
        }
        '''
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict

    def close(self):
        """Release the loaded models."""
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()

    '''Main parser entry point'''
    '''
    Post-process the model outputs into structures that are easier to work with.
        Model outputs: words, postags, ners, arcs, roles
        Derived structures:
            child_dict_list: dependency parse, each word's child (out-edge) info
            format_parse_list: dependency parse, each word's record plus its (unique) parent info
            roles_dict: semantic roles
    '''

    def parser_main(self, sentence):
        '''words, postags, ners, arcs are the raw LTP model outputs'''
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        ners = list(self.recognizer.recognize(words, postags))
        arcs = self.parser.parse(words, postags)

        # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
        """
        arcs holds one arc per word:
            arc.head is the index of the arc's parent word; ROOT is 0, and the
            words themselves are indexed from 1, 2, 3, ...
            arc.relation is the dependency relation of the arc.
            Note: at most one arc points at any word (in-degree 1), but a word
            may point at several other words (multiple out-edges).
        """
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)

        return words, postags, ners, child_dict_list, format_parse_list, roles_dict
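A usage sketch for SentenceParser, showing how roles_dict is consumed (the sentence is illustrative; LTP_DIR must point at the model directory):

sp = SentenceParser()
words, postags, ners, child_dict_list, format_parse_list, roles_dict = \
    sp.parser_main('李克强总理今天来我家了')
for pred_idx, args in roles_dict.items():
    for name, (_, start, end) in args.items():
        print(words[pred_idx], name, words[start:end + 1])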
Example #28
class LtpParser:
    def __init__(self):
        LTP_DIR = "../../res/ltp/ltp_data_v3.4.0"
        LTP_DIR_USER = "******"
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(os.path.join(LTP_DIR, "cws.model"), os.path.join(LTP_DIR_USER, "fulluserdict.txt"))
        # self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load_with_lexicon(os.path.join(LTP_DIR, "pos.model"), os.path.join(LTP_DIR_USER, "fulluserdict.txt"))
        # self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))

    '''Semantic role labeling'''

    def format_labelrole(self, words, postags):
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {arg.name: [arg.name, arg.range.start, arg.range.end] for arg in role.arguments}
        return roles_dict

    def build_parse_child_dict_two(self, words, arcs):
        """
        Maintain, for every word in the sentence, a dict of its dependency children.
        Args:
            words: segmented words
            arcs: dependency arcs
        """
        child_dict_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:
                    child_dict.setdefault(arcs[arc_index].relation, []).append(arc_index)
            # if 'SBV' in child_dict:
            #     print(words[index], child_dict['SBV'])
            child_dict_list.append(child_dict)
        return child_dict_list

    '''Dependency parsing: maintain, for every word in the sentence, a dict of its dependency children'''

    def build_parse_child_dict(self, words, postags, arcs):
        # print(words, postags, "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:  # arc heads are 1-based
                    child_dict.setdefault(arcs[arc_index].relation, []).append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]  # each word's dependency parent id
        relation = [arc.relation for arc in arcs]  # each word's dependency relation
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]  # resolve parent words
        for i in range(len(words)):
            # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1]]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''Main parser entry point'''

    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        parse_child_dict = self.build_parse_child_dict_two(words, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list, parse_child_dict

    '''Alternative parser entry point'''

    def parser_main_two(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        # named entity recognition: mainly person, place and organization names
        netags = self.recognizer.recognize(words, postags)
        # format the data
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        # semantic roles
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, netags, arcs, child_dict_list, format_parse_list, roles_dict
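A sketch contrasting the two entry points of Example #28 (the sentence is illustrative):

parser = LtpParser()
words, postags, child_dict_list, roles_dict, format_parse_list, parse_child_dict = \
    parser.parser_main('李克强总理今天来我家了')
# parser_main_two additionally returns the NER tags and the raw arcs:
words, postags, netags, arcs, child_dict_list, format_parse_list, roles_dict = \
    parser.parser_main_two('李克强总理今天来我家了')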
Example #29
class myLTP:
    def __init__(self, LTP_DATA_DIR, pattern_dir='pattern.txt'):
        self.LTP_DATA_DIR = LTP_DATA_DIR
        self.ne_pattern = self._read_ne_pattern(pattern_dir)

    def _read_ne_pattern(self, filename):
        ne_pattern = []
        with open(filename, encoding='utf8') as filein:
            for line in filein:
                if line[0] != '#':
                    np = line.split()[:2]
                    ne_pattern.append(np)
        return ne_pattern

    def find_ne_by_pattern(self, text):
        ne_dic = defaultdict(list)
        for ne_type, pattern in self.ne_pattern:
            nes = re.findall(pattern, text)
            text = re.sub(pattern, ne_type, text)
            ne_dic[ne_type].extend(nes)
        return text, ne_dic

    def load(self, index=[1, 1, 1, 1, 1]):
        """Word segmentation, POS tagging, NER, dependency parsing, semantic role labeling."""
        if index[0]:
            cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')
            self.segmentor = Segmentor()
            self.segmentor.load(cws_model_path)

        if index[1]:
            pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')
            self.postagger = Postagger()
            self.postagger.load(pos_model_path)

        if index[2]:
            ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')
            self.recognizer = NamedEntityRecognizer()
            self.recognizer.load(ner_model_path)

        if index[3]:
            par_model_path = os.path.join(self.LTP_DATA_DIR, 'parser.model')
            self.parser = Parser()
            self.parser.load(par_model_path)

        if index[4]:
            srl_model_path = os.path.join(self.LTP_DATA_DIR, 'pisrl_win.model')
            self.labeller = SementicRoleLabeller()
            self.labeller.load(srl_model_path)

    def release(self):
        try:
            self.segmentor.release()
        except:
            pass
        try:
            self.postagger.release()
        except:
            pass
        try:
            self.recognizer.release()
        except:
            pass
        try:
            self.parser.release()
        except:
            pass
        try:
            self.labeller.release()
        except:
            pass

    def split_sentence(self, text):
        """Sentence splitting."""
        return SentenceSplitter.split(text)

    def word_segment(self, sentence):
        """Segment words with jieba (the pyltp segmenter is commented out)."""
        # words = self.segmentor.segment(sentence)
        words = jieba.cut(sentence)
        return list(words)

    def pos_tag(self, words):
        """POS tagging."""
        postags = self.postagger.postag(words)
        return postags

    def named_entity_recognize(self, words, postags):
        """Named entity recognition."""
        netags = self.recognizer.recognize(words, postags)
        return netags

    def parse(self, words, postags):
        """Dependency parsing."""
        arcs = self.parser.parse(words, postags)  # (arc.head, arc.relation)
        return arcs

    def sementic_role_label(self, words, postags, arcs):
        """Semantic role labeling."""
        roles = self.labeller.label(words, postags, arcs)
        return roles

    def _get_ne_for_sentence(self, sentence):
        """Collect entities, including some defined by regex patterns."""

        sentence, ne_dic = self.find_ne_by_pattern(sentence)
        words = list(self.word_segment(sentence))
        postags = self.postagger.postag(words)
        ners = self.named_entity_recognize(words, postags)
        res = {}
        res['words'] = words
        res['ners'] = []
        for index, ner in enumerate(ners):
            if ner != 'O':
                if ner[0] in ('S', 'B'):
                    res['ners'].append([ner[2:], index, index + 1])
                else:
                    res['ners'][-1][-1] += 1
        for ner_type, v in ne_dic.items():
            v = iter(v)
            if v:
                for index, word in enumerate(words):
                    if word == ner_type:
                        words[index] = v.__next__()
                        res['ners'].append([ner_type, index, index + 1])
        return res

    def _get_dne_for_sentence(self, sentence):
        res = []
        s = self._get_ne_for_sentence(sentence)
        ners = s['ners']
        words = s['words']
        for entity1, entity2 in combinations(ners, 2):
            res.append((entity1, entity2, words))
        return res

    def get_dne(self, text):
        """Collect entity pairs: persons (Nh), places (Ns), organizations (Ni)."""
        res = []
        sentences = self.split_sentence(text)
        for sentence in sentences:
            r = self._get_dne_for_sentence(sentence)
            res.extend(r)
        return res
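A usage sketch for myLTP; it assumes a pattern.txt whose non-comment lines hold '<NE_TYPE> <regex>' pairs (paths and sentence are illustrative):

ltp = myLTP('./ltp_data', pattern_dir='pattern.txt')
ltp.load(index=[0, 1, 1, 0, 0])  # get_dne only needs the POS tagger and NER (segmentation uses jieba)
pairs = ltp.get_dne('李克强总理今天在北京会见了马云。')
for entity1, entity2, words in pairs:
    print(entity1, entity2)
ltp.release()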
Example #30
class ltp_api(object):
    def __init__(self, MODELDIR, exword_path=None):
        self.MODELDIR = MODELDIR
        self.output = {}
        self.words = None
        self.postags = None
        self.netags = None
        self.arcs = None
        self.exword_path = exword_path  # e.x: '/data1/research/matt/ltp/exwords.txt'
        # word segmentation
        self.segmentor = Segmentor()
        if not self.exword_path:
            # no extra user dictionary: load the plain model
            self.segmentor.load(os.path.join(self.MODELDIR, "cws.model"))
        else:
            self.segmentor.load_with_lexicon(
                os.path.join(self.MODELDIR, "cws.model"), self.exword_path)

        # POS tagging
        self.postagger = Postagger()
        self.postagger.load(os.path.join(self.MODELDIR, "pos.model"))
        # dependency parsing
        self.parser = Parser()
        self.parser.load(os.path.join(self.MODELDIR, "parser.model"))
        # named entity recognition
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(self.MODELDIR, "ner.model"))
        # semantic role labeling
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(MODELDIR, "pisrl.model"))

    # word segmentation
    def ltp_segmentor(self, sentence):
        words = self.segmentor.segment(sentence)
        return words

    # POS tagging
    def ltp_postagger(self, words):
        postags = self.postagger.postag(words)
        return postags

    # dependency parsing
    def ltp_parser(self, words, postags):
        arcs = self.parser.parse(words, postags)
        return arcs

    # named entity recognition
    def ltp_recognizer(self, words, postags):
        netags = self.recognizer.recognize(words, postags)
        return netags

    # semantic role labeling
    def ltp_labeller(self, words, postags, arcs):
        output = []
        roles = self.labeller.label(words, postags, arcs)
        for role in roles:
            output.append([(role.index, arg.name, arg.range.start,
                            arg.range.end) for arg in role.arguments])
        return output

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()

    def get_result(self, sentence):
        self.words = self.ltp_segmentor(sentence)
        self.postags = self.ltp_postagger(self.words)
        self.arcs = self.ltp_parser(self.words, self.postags)
        self.netags = self.ltp_recognizer(self.words, self.postags)
        self.output['role'] = self.ltp_labeller(self.words, self.postags,
                                                self.arcs)

        # fill the output dict
        self.output['words'] = list(self.words)
        self.output['postags'] = list(self.postags)
        self.output['arcs'] = [(arc.head, arc.relation) for arc in self.arcs]
        self.output['netags'] = list(self.netags)
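A usage sketch for ltp_api (the model directory is illustrative):

api = ltp_api('./ltp_data_v3.4.0')
api.get_result('元芳你怎么看')
print(api.output['words'])
print(api.output['role'])  # per predicate: [(predicate index, role, start, end), ...]
api.release()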
Example #31
class LtpParser:
    def __init__(self):

        #initialize every ltp tool
        LTP_DIR = "/home/demo1/support_ltp"

        # word segmenter
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        # POS tagger
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        # dependency parser
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        # named entity recognizer
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        # semantic role labeling module
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))

    '''Semantic role labeling'''

    def format_labelrole(self, words, postags):

        # dependency parsing builds on the POS tags:
        # the parser consumes the words together with their POS tags
        arcs = self.parser.parse(words, postags)

        # label semantic roles on top of the dependency parse
        roles = self.labeller.label(words, postags, arcs)

        # store the result as a nested dict keyed by predicate index;
        # each inner dict is keyed by the semantic role type (arg.name)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }

        print(roles_dict)
        return roles_dict

    '''Dependency parsing: maintain, for every word in the sentence, a dict of its dependency children'''

    def build_parse_child_dict(self, words, postags, arcs):

        # data structure: a list of dicts, one per word; each dict maps a
        # relation name to the children standing in that relation, giving
        # each parent's relations and the children attached through them
        child_dict_list = []

        # this list spells out each word's dependency record
        format_parse_list = []

        # outer loop: inspect every word
        for index in range(len(words)):

            # start with an empty child dict
            child_dict = dict()

            # inner loop: scan every arc for children of this word
            for arc_index in range(len(arcs)):

                # if this word is the arc's parent, record the child
                if arcs[arc_index].head == index + 1:  # arc heads are 1-based
                    child_dict.setdefault(arcs[arc_index].relation, []).append(arc_index)
            child_dict_list.append(child_dict)

        rely_id = [arc.head for arc in arcs]  # each word's dependency parent id
        relation = [arc.relation for arc in arcs]  # each word's dependency relation
        heads = ['Root' if id == 0 else words[id - 1]
                 for id in rely_id]  # resolve parent words
        for i in range(len(words)):
            # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [
                relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                postags[rely_id[i] - 1]
            ]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''Main parser entry point'''

    def parser_main(self, sentence):
        '''The class's main entry point.'''

        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list