def do_rolelabel(self, words, postags, netags, arcs):
    # Lazily create the labeller on first use, timing the model load in debug mode.
    if self.labeller is None:
        self.labeller = pyltp.SementicRoleLabeller()
        if self.debug:
            load_start = default_timer()
        self.labeller.load(os.path.join(self.model_dir, 'srl'))
        if self.debug:
            load_use = default_timer() - load_start
            self.loger.debug("loading the srl model took [ %f ] s" % load_use)
    roles = self.labeller.label(words, postags, netags, arcs)
    return roles
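Note that the four-argument `label(words, postags, netags, arcs)` call matches the older pyltp API, where the SRL model is a directory named `srl`; with LTP 3.4.0's `pisrl.model`, `label` takes three arguments, as the later snippets show. A minimal sketch of consuming the returned roles, using only the `role.index` / `role.arguments` fields that also appear in the commented-out demo further down:

def print_roles(words, roles):
    # Each role marks a predicate (role.index into `words`) plus its labelled
    # argument spans (A0, A1, TMP, ...), given as inclusive word ranges.
    for role in roles:
        args = " ".join("%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                        for arg in role.arguments)
        print(words[role.index], "->", args)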
def __init__(self):
    self.path = 'ltp_data_v3.4.0/'  # models from https://ltp.ai/download.html (version 3.4.0)
    self.segmentor = pp.Segmentor()
    self.segmentor.load(self.path + "cws.model")  # word segmentation model
    self.postagger = pp.Postagger()
    self.postagger.load(self.path + "pos.model")  # part-of-speech tagging model
    self.recognizer = pp.NamedEntityRecognizer()
    self.recognizer.load(self.path + "ner.model")  # named entity recognition model
    self.parser = pp.Parser()
    self.parser.load(self.path + "parser.model")  # dependency parsing model
    self.labeller = pp.SementicRoleLabeller()
    self.labeller.load(self.path + "pisrl.model")  # semantic role labelling model
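A minimal driver sketch for the pipeline loaded above, assuming `nlp` is an instance of the surrounding class (whose name is not shown); the call chain is the standard pyltp flow, and the sample sentence comes from the commented example in the next snippet:

def analyse(nlp, sentence="中国进出口银行与中国银行加强合作"):
    # Run the full LTP pipeline with the models loaded in __init__ above.
    words = nlp.segmentor.segment(sentence)
    postags = nlp.postagger.postag(words)
    netags = nlp.recognizer.recognize(words, postags)
    arcs = nlp.parser.parse(words, postags)
    roles = nlp.labeller.label(words, postags, arcs)  # 3-argument API of LTP 3.4.0
    return words, postags, netags, arcs, roles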
def ltp_process(sentence):
    stop_words = get_stops_words()  # stop words (e.g. 是), used to drop stop-word predicates from the SBV results

    # Segmentation
    segmentor = pyltp.Segmentor()
    segmentor.load("./cws.model")
    words = segmentor.segment(sentence)
    print("\t".join(words))
    segmentor.release()

    # POS tagging
    postagger = pyltp.Postagger()
    postagger.load("./pos.model")
    postags = postagger.postag(words)
    # list-of-string parameter is supported since 0.1.5
    # postags = postagger.postag(["中国", "进出口", "银行", "与", "中国银行", "加强", "合作"])
    print("\t".join(postags))
    postagger.release()

    # Dependency parsing
    parser = pyltp.Parser()
    parser.load("./parser.model")
    arcs = parser.parse(words, postags)
    parser.release()

    # Semantic role labelling; not used yet. Moved up here on purpose: once the SBV's
    # head is known to be a speech verb (e.g. 说), that word must be extracted; in short,
    # if it is SBV and also maps to A0, it has to be the subject.
    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./pisrl.model")
    roles = labeller.label(words, postags, arcs)
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

    SBV_set = list()
    Subject_label_set = list()
    Word_of_speech_content = list()
    Index_of_Subject = 0
    for arc in arcs:
        # SBV_index = get_repeat(arc.head, "SBV")
        if arc.relation == "SBV" and words[arc.head - 1] not in stop_words:
            # Is checking only for SBV really not rigorous enough? Once an SBV arc
            # is found, its dependent word is necessarily A0.
            SBV_set.append(words[arc.head - 1])  # arc.head counts from 1; store the predicate the SBV points to
            Subject_label_set.append(words[Index_of_Subject])  # with an SBV arc, the word at this position is the subject
            Word_of_speech_content.append(words[arc.head:])  # everything after the predicate, i.e. the reported speech
            Index_of_Subject += 1
        else:
            Index_of_Subject += 1
            continue
    # If the lists are empty, the sentence is not worth analysing further.
    '''
    recognizer = pyltp.NamedEntityRecognizer()
    recognizer.load("./ner.model")
    netags = recognizer.recognize(words, postags)
    print("\t".join(netags))

    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./pisrl.model")
    roles = labeller.label(words, postags, arcs)
    for role in roles:
        print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    '''
    # Returns three lists: the predicates the SBV arcs point to (HED), the matching
    # subjects, and the content after each predicate. Note that any of them may be [].
    return SBV_set, Subject_label_set, Word_of_speech_content
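A hypothetical driver for `ltp_process`; the sample sentence is illustrative, and the model files are assumed to sit in the working directory as the relative paths above require:

# Illustrative only: the sentence and its expected SBV structure are assumptions.
sbv_verbs, subjects, contents = ltp_process("小明说今天天气很好。")
for verb, subject, content in zip(sbv_verbs, subjects, contents):
    # `content` is the slice of words after the predicate
    print("%s %s: %s" % (subject, verb, "".join(content)))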
def ltp_process(sentence, old_SI={}):
    stop_words = get_stops_words()  # stop words (e.g. 是), used to drop stop-word predicates from the SBV results

    # Segmentation
    segmentor = pyltp.Segmentor()
    segmentor.load("./model/cws.model")
    words = segmentor.segment(sentence)
    #print("\t".join(words))
    segmentor.release()

    # POS tagging
    postagger = pyltp.Postagger()
    postagger.load("./model/pos.model")
    postags = postagger.postag(words)  # list-of-string parameter is supported since 0.1.5
    #print("\t".join(postags))
    postagger.release()

    # Dependency parsing
    parser = pyltp.Parser()
    parser.load("./model/parser.model")
    arcs = parser.parse(words, postags)
    parser.release()

    # Semantic role labelling, moved up here on purpose: once the SBV's head is known to
    # be a speech verb (e.g. 说), that word must be extracted; in short, if it is SBV and
    # also maps to A0, it has to be the subject.
    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./model/pisrl.model")
    roles = labeller.label(words, postags, arcs)
    #print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

    # Collect all nouns and their positions
    noun_tags = ['nh', 'nd', 'n', 'ni', 'nl', 'ns', 'nt', 'nz']
    SI_words = {}  # word -> index
    for tag in noun_tags:
        SI_index = np.argwhere(np.array(postags) == tag).reshape(-1).tolist()
        for j in SI_index:
            SI_words[words[j]] = j
    #print(SI_words)

    SBV_set = list()
    Subject_label_set = list()
    Word_of_speech_content = list()
    Index_of_Subject = 0
    SI_postags = {}
    si_SBV_postag = []
    si_VOB_postag = []
    pronouns = ['他', '他们', '你', '你们', '我', '我们', '她', '她们']
    for arc in arcs:
        if arc.relation == "SBV" and words[arc.head - 1] not in stop_words:
            # Is checking only for SBV really not rigorous enough? Once an SBV arc
            # is found, its dependent word is necessarily A0.
            #print(arc.head, words[arc.head - 1])
            SBV_set.append(words[arc.head - 1])  # arc.head counts from 1; store the predicate the SBV points to
            if words[Index_of_Subject] in pronouns:
                # Anaphora resolution: if old_SI holds entities from earlier sentences,
                # replace the pronoun with the highest-scoring one. Still needs a fix:
                # a noun phrase such as 习近平 + 总书记 should count as one word, or
                # 习近平 should be given the same weight as 总书记.
                if old_SI:
                    ag2entity = np.argmax(list(old_SI.params.values()))  # pick by score, not by key
                    words[Index_of_Subject] = list(old_SI.params.keys())[ag2entity]
                Subject_label_set.append(words[Index_of_Subject])
            else:
                Subject_label_set.append(words[Index_of_Subject])  # not a pronoun, so this position is the subject
            #SI_postag[words[Index_of_Subject].split(':')[1]] = Index_of_Subject
            if postags[arc.head - 1] == 'v':
                si_SBV_postag.append((words[Index_of_Subject], Index_of_Subject))
            Word_of_speech_content.append(intro_speech(''.join(words[arc.head:])))  # extract the reported speech
            #print(intro_speech(''.join(words[arc.head:])))
            Index_of_Subject += 1
            SI_postags[arc.relation] = si_SBV_postag
        elif arc.relation == 'VOB' and words[arc.head - 1] not in stop_words:
            # Handle objects as well
            if words[Index_of_Subject] in pronouns:
                # Anaphora resolution: TODO, use the object position and the
                # highest-scoring element from the previous sentence.
                pass
            else:
                Subject_label_set.append(words[Index_of_Subject])  # not a pronoun, so this position is the object
                si_VOB_postag.append((words[Index_of_Subject], Index_of_Subject))
            Index_of_Subject += 1
            SI_postags[arc.relation] = si_VOB_postag
        else:
            Index_of_Subject += 1
            continue
    # If the lists are empty, the sentence is not worth analysing further.

    Focus_point = Si(SI_words, SI_postags, old_SI)  # focus-of-attention set
    Focus_point.score()  # updates Focus_point.params

    recognizer = pyltp.NamedEntityRecognizer()
    recognizer.load("./model/ner.model")
    netags = recognizer.recognize(words, postags)
    #print("\t".join(netags))
    '''
    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./pisrl.model")
    roles = labeller.label(words, postags, arcs)
    for role in roles:
        print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    '''
    # Returns the SBV head verbs (HED), the matching subjects, the speech content,
    # and the focus set; any of these may be empty.
    return SBV_set, Subject_label_set, Word_of_speech_content, Focus_point
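This variant depends on helpers that are not shown (`get_stops_words`, `intro_speech`, `Si`). Below is a purely hypothetical stand-in for `Si`, matching only the interface the code above relies on: a `params` mapping of entity to salience score (read by the pronoun-resolution step) and a `score()` method; the weighting scheme is an assumption, not the author's:

class Si:
    # Hypothetical stand-in for the undefined Si class; interface only.
    def __init__(self, si_words, si_postags, old_si):
        # carry scores over from the previous sentence when available
        self.params = dict(old_si.params) if old_si else {}
        self.si_words = si_words
        self.si_postags = si_postags

    def score(self):
        # toy scoring (assumed weights): subjects count more than objects
        weights = {'SBV': 2.0, 'VOB': 1.0}
        for rel, pairs in self.si_postags.items():
            for word, _idx in pairs:
                self.params[word] = self.params.get(word, 0.0) + weights.get(rel, 0.5)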
def srl(words, postags, arcs):
    global srl_
    if srl_ is None:
        srl_ = pyltp.SementicRoleLabeller()
        srl_.load(ltp_models['pisrl_win'])
    return srl_.label(words, postags, arcs)
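This last snippet caches the labeller in module-level state. A sketch of that state, where the dictionary layout and path are assumptions (LTP 3.4.0 ships a separate SRL model for Windows, `pisrl_win.model`, which the key name suggests):

import pyltp

srl_ = None  # cached SementicRoleLabeller, created lazily by srl()
ltp_models = {
    # hypothetical layout; point this at your local LTP 3.4.0 model files
    'pisrl_win': 'ltp_data_v3.4.0/pisrl_win.model',
}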