Example #1
    def do_rolelabel(self, words, postags, netags, arcs):
        # Lazily load the semantic role labelling model on first use.
        if self.labeller is None:
            self.labeller = pyltp.SementicRoleLabeller()
            if self.debug:
                load_start = default_timer()
            self.labeller.load(os.path.join(self.model_dir, 'srl'))
            if self.debug:
                load_use = default_timer() - load_start
                self.loger.debug("loading srl model took [ %f ] s" % load_use)

        roles = self.labeller.label(words, postags, netags, arcs)

        return roles
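A minimal usage sketch for the lazy-loading method above; the surrounding class (its model_dir, debug and loger attributes) and the upstream pipeline outputs are assumed, and the role iteration follows the commented-out pattern in Example #3:

# Hypothetical driver code: `nlp` is an instance of the class defining do_rolelabel,
# and words/postags/netags/arcs come from earlier segmentation, tagging, NER and
# parsing stages (see the other examples below).
roles = nlp.do_rolelabel(words, postags, netags, arcs)
for role in roles:
    # role.index is the predicate's word index; each argument covers a word span
    print(role.index, " ".join(
        "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
        for arg in role.arguments))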
Example #2
    def __init__(self):
        self.path = 'ltp_data_v3.4.0/'  # download from https://ltp.ai/download.html (v3.4.0)
        self.segmentor = pp.Segmentor()
        self.segmentor.load(self.path + "cws.model")  # load the word segmentation model

        self.postagger = pp.Postagger()
        self.postagger.load(self.path + "pos.model")  # load the part-of-speech tagging model

        self.recognizer = pp.NamedEntityRecognizer()
        self.recognizer.load(self.path + "ner.model")  # load the named entity recognition model

        self.parser = pp.Parser()
        self.parser.load(self.path + "parser.model")  # load the dependency parsing model

        self.labeller = pp.SementicRoleLabeller()
        self.labeller.load(self.path + "pisrl.model")  # load the semantic role labelling model
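A usage sketch for the pipeline initialised above. The class name LtpPipeline is an assumption (the original snippet only shows __init__), the sample sentence is taken from a comment in Example #3, and whether label() also takes netags depends on the pyltp version:

nlp = LtpPipeline()  # hypothetical name for the class defined above
sentence = '中国进出口银行与中国银行加强合作'
words = nlp.segmentor.segment(sentence)
postags = nlp.postagger.postag(words)
netags = nlp.recognizer.recognize(words, postags)
arcs = nlp.parser.parse(words, postags)
roles = nlp.labeller.label(words, postags, arcs)  # pyltp >= 0.2 drops the netags argument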
Example #3
import pyltp


def ltp_process(sentence):
    stop_words = get_stops_words()  # fetch stop words so that SBV head words that are stop words (e.g. "是") can be dropped

    # word segmentation
    segmentor = pyltp.Segmentor()
    segmentor.load("./cws.model")
    words = segmentor.segment(sentence)
    print("\t".join(words))
    segmentor.release()

    # part-of-speech tagging
    postagger = pyltp.Postagger()
    postagger.load("./pos.model")
    postags = postagger.postag(words)
    # a list-of-string parameter is supported in 0.1.5
    # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
    print("\t".join(postags))
    postagger.release()

    # dependency parsing
    parser = pyltp.Parser()
    parser.load("./parser.model")
    arcs = parser.parse(words, postags)
    parser.release()

    # Semantic role labelling; the results are not used yet.
    # Moved up on purpose: after checking that the head of the current SBV is related to "说" (to say),
    # that word must be extracted; in short, if it is an SBV dependent that also corresponds to A0, it must be the subject.
    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./pisrl.model")
    roles = labeller.label(words, postags, arcs)

    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

    SBV_set = list()
    Subject_label_set = list()
    Word_of_speech_content = list()

    Index_of_Subjet = 0

    for arc in arcs:
        #SBV_index = get_repeat(arc.head, "SBV")
        if arc.relation == "SBV" and words[arc.head - 1] not in stop_words:
            # Arguably this check is not strict enough: only the SBV relation is
            # tested, but once an SBV arc is found, its dependent is necessarily A0.
            SBV_set.append(words[arc.head - 1])  # arc.head is 1-based; store the predicate verb the SBV points to
            Subject_label_set.append(words[Index_of_Subjet])  # with an SBV arc, the word at this position must be the subject
            Word_of_speech_content.append(words[arc.head:])  # the part of the sentence after the predicate
        # Advance the word index either way; if the result lists stay empty,
        # the sentence is not worth analysing further.
        Index_of_Subjet += 1
    '''
    recognizer = pyltp.NamedEntityRecognizer()
    recognizer.load("./ner.model")
    netags = recognizer.recognize(words, postags)
    print("\t".join(netags))
    
    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./pisrl.model")
    roles = labeller.label(words, postags, arcs)

    for role in roles:
        print(role.index, "".join(
                ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    '''

    return SBV_set, Subject_label_set, Word_of_speech_content  # three lists: the predicates the SBV arcs point to (HED), the subjects of those SBVs, and the text after each predicate; note that any of them may be []
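The counter-based loop above can be written more compactly with enumerate, since the loop index and Index_of_Subjet always move in lockstep. A sketch under the same assumptions (words, arcs and stop_words as produced inside the function):

def extract_sbv(words, arcs, stop_words):
    """Collect (subject, predicate, trailing words) triples for SBV arcs."""
    triples = []
    for i, arc in enumerate(arcs):
        # arc.head is 1-based (0 denotes ROOT), so the head word is words[arc.head - 1]
        if arc.relation == 'SBV' and words[arc.head - 1] not in stop_words:
            triples.append((words[i], words[arc.head - 1], list(words[arc.head:])))
    return triples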
Example #4
import numpy as np
import pyltp


def ltp_process(sentence, old_SI=None):
    old_SI = old_SI or {}  # avoid a mutable default argument

    stop_words = get_stops_words()  # fetch stop words so that SBV head words that are stop words (e.g. "是") can be dropped

    # word segmentation
    segmentor = pyltp.Segmentor()
    segmentor.load("./model/cws.model")
    words = segmentor.segment(sentence)
    #print("\t".join(words))
    segmentor.release()

    # part-of-speech tagging
    postagger = pyltp.Postagger()
    postagger.load("./model/pos.model")
    postags = postagger.postag(words)
    # a list-of-string parameter is supported in 0.1.5
    #print("\t".join(postags))
    postagger.release()

    # dependency parsing
    parser = pyltp.Parser()
    parser.load("./model/parser.model")
    arcs = parser.parse(words, postags)
    parser.release()

    # Moved up on purpose: after checking that the head of the current SBV is related to "说" (to say),
    # that word must be extracted; in short, if it is an SBV dependent that also corresponds to A0, it must be the subject.
    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./model/pisrl.model")
    roles = labeller.label(words, postags, arcs)

    #print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)) # 依存句法分析

    noun_tags = ['nh', 'nd', 'n', 'ni', 'nl', 'ns', 'nt', 'nz']
    SI_words = {}  # word -> index map for nouns
    for tag in noun_tags:
        SI_index = np.argwhere(np.array(postags) == tag).reshape(-1).tolist()
        for j in SI_index:
            SI_words[words[j]] = j

    #print(SI_words)

    SBV_set = list()
    Subject_label_set = list()
    Word_of_speech_content = list()

    Index_of_Subjet = 0
    SI_postags = {}
    si_SBV_postag = []
    si_VOB_postag = []
    for arc in arcs:
        # SBV_index = get_repeat(arc.head, "SBV")
        if arc.relation == "SBV" and words[arc.head - 1] not in stop_words:
            # Arguably this check is not strict enough: only the SBV relation is
            # tested, but once an SBV arc is found, its dependent is necessarily A0.
            #print(arc.head, words[arc.head -1])
            SBV_set.append(words[arc.head - 1])  # arc.head is 1-based; store the predicate verb the SBV points to
            # Subject check:
            if words[Index_of_Subjet] in [
                    '他', '他们', '你', '你们', '我', '我们', '她', '她们'
            ]:
                # Coreference resolution: look at the current old_SI and, if it
                # contains entities for the same role, replace the pronoun with the
                # highest-scored one. A correction is still needed: noun phrases such
                # as 习近平+总书记 should be a single word, or 习近平 should get the
                # same weight as 总书记.
                if old_SI:
                    # argmax over the scores, then look up the matching entity
                    ag2entity = int(np.argmax(list(old_SI.params.values())))
                    words[Index_of_Subjet] = list(old_SI.params.keys())[ag2entity]

                Subject_label_set.append(words[Index_of_Subjet])

            else:
                Subject_label_set.append(words[Index_of_Subjet])  # not a pronoun, so the word at this position must be the subject
                #SI_postag[words[Index_of_Subjet].split(':')[1]] = Index_of_Subjet
                if postags[arc.head - 1] == 'v':
                    si_SBV_postag.append(
                        (words[Index_of_Subjet], Index_of_Subjet))

            Word_of_speech_content.append(
                intro_speech(''.join(words[arc.head:])))  # extract the quoted speech
            #print(intro_speech(''.join(words[arc.head:])))
            Index_of_Subjet += 1
            SI_postags[arc.relation] = si_SBV_postag

        elif arc.relation == 'VOB' and words[arc.head - 1] not in stop_words:
            # Object check:
            if words[Index_of_Subjet] in [
                    '他', '他们', '你', '你们', '我', '我们', '她', '她们'
            ]:
                # Coreference resolution: bring in the previous sentence's object
                # position and its highest-scored element.
                pass

            else:
                Subject_label_set.append(words[Index_of_Subjet])  # not a pronoun, so the word at this position must be the subject

                si_VOB_postag.append((words[Index_of_Subjet], Index_of_Subjet))

            Index_of_Subjet += 1
            SI_postags[arc.relation] = si_VOB_postag

        else:
            Index_of_Subjet += 1
            continue  # if the result lists stay empty, the sentence is not worth analysing further

    Forcus_point = Si(SI_words, SI_postags, old_SI)  # focus-of-attention set
    # self.params needs to be updated
    Forcus_point.score()

    recognizer = pyltp.NamedEntityRecognizer()
    recognizer.load("./model/ner.model")
    netags = recognizer.recognize(words, postags)
    #print("\t".join(netags))
    '''
    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./pisrl.model")
    roles = labeller.label(words, postags, arcs)

    for role in roles:
        print(role.index, "".join(
                ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    '''
    return SBV_set, Subject_label_set, Word_of_speech_content, Forcus_point  # the predicates the SBV arcs point to (HED), the subjects of those SBVs, the quoted speech, and the focus set; the results may be empty
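The pronoun-replacement branch assumes old_SI.params maps candidate entities to salience scores; the Si class itself is not shown in this example. A minimal sketch of that lookup with hypothetical scores:

params = {'习近平': 3.0, '总书记': 3.0, '记者': 1.0}  # hypothetical salience scores
best_entity = max(params, key=params.get)  # the highest-scored candidate
# a pronoun such as '他' would then be replaced by best_entity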
Example #5
import pyltp

srl_ = None  # module-level cache so the model is loaded only once


def srl(words, postags, arcs):
    global srl_
    if srl_ is None:
        srl_ = pyltp.SementicRoleLabeller()
        srl_.load(ltp_models['pisrl_win'])  # ltp_models maps model names to file paths elsewhere in the module
    return srl_.label(words, postags, arcs)
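A usage sketch of the cached labeller; ltp_models and the preprocessing outputs are assumed to exist as in the earlier examples. Keeping the labeller in a module-level global avoids reloading the large pisrl model on every call:

roles = srl(words, postags, arcs)          # first call loads the model
more_roles = srl(words2, postags2, arcs2)  # later calls reuse the cached instance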