def role_label(self, words, postags, arcs):
    """Semantic role labelling (SRL).

    Loads the Windows pi-SRL model from ``self.MODEL_PATH``, labels one
    already-parsed sentence, prints each predicate with its arguments,
    and returns a string form of the raw result.

    :param words: segmented word list
    :param postags: POS tags aligned with ``words``
    :param arcs: dependency-parse result for ``words``
    :return: ``"roles" + str(roles)`` -- NOTE(review): this is just the raw
        object's repr, presumably only useful for debugging.
    """
    model_file = os.path.join(self.MODEL_PATH, 'pisrl_win.model')
    srl = SementicRoleLabeller()      # build the labeller
    srl.load(model_file)              # load the SRL model from disk
    roles = srl.label(words, postags, arcs)
    for role in roles:
        parts = []
        for arg in role.arguments:
            parts.append("{0}:({1},{2})".format(arg.name, arg.range.start, arg.range.end))
        print(role.index, "".join(parts))
    srl.release()
    return "roles{}".format(roles)
def semantic_role_label(self):
    """Semantic role labelling: harvest entity words from A0/A1 arguments.

    Dependency-parses ``self.words``/``self.postags``, runs SRL, and
    appends to ``self.entity`` every noun-like word (postag n/ns/nh/ni)
    inside an A0 (agent) or A1 (patient) argument of reasonable length.

    Fixes: removed dead ``else: pass`` branches and collapsed the
    repetitive postag comparisons into membership tests.
    """
    # Dependency parsing -- the SRL model needs the arcs as input.
    parser = Parser()
    parser.load('ltp_data/parser.model')
    arcs = parser.parse(self.words, self.postags)
    parser.release()

    labeller = SementicRoleLabeller()
    labeller.load('ltp_data/srl')
    # NOTE(review): this call passes netags, matching the older pyltp SRL
    # API; newer releases drop that argument -- confirm installed version.
    roles = labeller.label(self.words, self.postags, self.netags, arcs)

    # Collect every A0 (agent) / A1 (patient) argument.
    ax_labels = [arg
                 for role in roles
                 for arg in role.arguments
                 if arg.name in ("A0", "A1")]

    for label in ax_labels:
        span = label.range.end - label.range.start
        # Skip single-word and abnormally long (>= 10 words) arguments.
        if 0 < span < 10:
            for i in range(label.range.start, label.range.end + 1):
                # Keep common nouns (n), place names (ns), person names (nh)
                # and organisation names (ni) as entities.
                if self.postags[i] in ("n", "ns", "nh", "ni"):
                    self.entity.append(self.words[i])
    labeller.release()
class LTP_word():
    """LTP pipeline wrapper.

    deal(text) processes a text and returns five values: words, POS tags,
    dependency arcs, semantic roles and named-entity tags.
    release() frees the cached models.
    """
    def __init__(self, model_path):
        # model_path: directory containing the LTP model files
        self.model_path = model_path
        self.segmentor = Segmentor()  # segmenter instance
        # Segmentation with an extra user dictionary (dictionary_kfc.txt).
        self.segmentor.load_with_lexicon(path.join(self.model_path, 'cws.model'),
                                         path.join(self.model_path, 'dictionary_kfc.txt'))
        self.postagger = Postagger()  # POS tagger instance
        self.postagger.load(path.join(self.model_path, 'pos.model') )  # load model
        self.recognizer = NamedEntityRecognizer()  # NER instance
        self.recognizer.load(path.join(self.model_path, 'ner.model'))
        self.parser = Parser()  # dependency parser instance
        self.parser.load(path.join(self.model_path, 'parser.model'))  # load model
        self.labeller = SementicRoleLabeller()  # SRL instance
        self.labeller.load(path.join(self.model_path, 'srl'))

    def deal (self, text):
        """Run the whole pipeline over *text*.

        :return: (words, postags, arcs, roles, netags)
        """
        words =self.segmentor.segment(text)  # segmentation
        postags = self.postagger.postag(words)  # POS tagging
        netags = self.recognizer.recognize(words, postags)  # named entities
        arcs = self.parser.parse(words, postags)  # dependency parse
        # This pyltp version's SRL takes netags in addition to arcs.
        roles = self.labeller.label(words, postags, netags, arcs)
        return words,postags,arcs,roles,netags

    def release(self):
        """Free all loaded models."""
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
def role(words, postags, netags, arcs):
    """Semantic role labelling: load the SRL model, label one sentence,
    release the model, and return the raw roles object."""
    srl = SementicRoleLabeller()  # labeller instance
    # labeller.load('/usr/local/src/ltp_data/srl')  # alternative model location
    srl.load(srl_model_path)  # load the model
    roles = srl.label(words, postags, netags, arcs)  # run SRL
    """
    #arg.name 表示语义角色关系
    #arg.range.start 表示起始词位置
    #arg.range.end 表示结束位置
    roletype = {'C-A0':'施事','A0':'施事','A1':'受事','A2':'间接对象','A3':'直接目标','A4':'直接方法','A5':'其它','ADV':'附词','BNE':'受益人'
        , 'CND': '条件','DIR':'方向','DGR':'程度','EXT':'扩展','FRQ':'频率','LOC':'地点','MNR':'方式','PRP':'目的或原因'
        , 'TMP': '时间', 'TPC': '主题', 'CRD': '并列', 'PRD': '谓词', 'PSR': '持有者', 'PSE': '被持有','DIS': '转折'}
    postype = {'A0':'施事','A1':'受事','A2':'间接对象','A3':'直接目标','A4':'直接方法','A5':'其它','ADV':'附词','BNE':'受益人'
        , 'CND': '条件','DIR':'方向','DGR':'程度','EXT':'扩展','FRQ':'频率','LOC':'地点','MNR':'方式','PRP':'目的或原因'
        , 'TMP': '时间', 'TPC': '主题', 'CRD': '并列', 'PRD': '谓词', 'PSR': '持有者', 'PSE': '被持有'}
    for role in roles:
        #print role.index, "".join(["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])
        outstr = ""
        for arg in role.arguments:
            block = ''
            for num in range(arg.range.start, arg.range.end+1):
                block = block + words[num]+'[%d-%s]'%(num,postags[num])
            outstr = outstr + roletype[arg.name] + "(%s);" % block
        print '%d-%s'%(role.index,words[role.index])+ ":"+outstr
    """
    srl.release()  # free the model
    return roles
class LtpParser:
    """pyltp pipeline wrapper: segmentation, POS, parse, NER and SRL."""

    def __init__(self):
        LTP_DIR = "./ltp_data_v3.4.0"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))

    '''语义角色标注'''
    def format_labelrole(self, words, postags):
        """Semantic role labelling.

        :return: dict mapping predicate index ->
                 {arg name: [name, start, end]} (word indices are 0-based)
        """
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict

    '''句法分析---为句子中的每个词语维护一个保存句法依存儿子节点的字典'''
    def build_parse_child_dict(self, words, postags, arcs):
        """Build per-word dependency-children dicts and a formatted parse.

        :return: (child_dict_list, format_parse_list) where each formatted
                 entry is [relation, word, index, POS, head word,
                 head index, head POS]
        """
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:  # arc heads are 1-based
                    # setdefault replaces the verbose create-then-append idiom
                    child_dict.setdefault(arcs[arc_index].relation, []).append(arc_index)
            child_dict_list.append(child_dict)

        rely_id = [arc.head for arc in arcs]        # 提取依存父节点id
        relation = [arc.relation for arc in arcs]   # 提取依存关系
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]  # 匹配依存父节点词语
        for i in range(len(words)):
            # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [relation[i], words[i], i, postags[i],
                 heads[i], rely_id[i] - 1, postags[rely_id[i] - 1]]
            format_parse_list.append(a)
        return child_dict_list, format_parse_list

    '''parser主函数'''
    def parser_main(self, sentence):
        """Segment, tag and parse *sentence*; return all derived structures."""
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list
def role_label(words, postags, netags, arcs):
    """Print each predicate's semantic roles for an already-parsed sentence."""
    srl = SementicRoleLabeller()  # build the labeller
    srl.load('/Users/chenming/Spyder/3.3.1/ltp_data/srl/')  # load the model
    roles = srl.label(words, postags, netags, arcs)  # run SRL
    for role in roles:
        args_repr = "".join(
            "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
            for arg in role.arguments
        )
        print(role.index, args_repr)
    srl.release()  # free the model
def role_label(words, postags, netags, arcs):
    """Run SRL over one sentence and return the raw roles object.

    NOTE(review): `netags` is accepted but never used -- this call site
    passes only words/postags/arcs to the labeller; confirm whether the
    parameter is vestigial.
    """
    srl = SementicRoleLabeller()
    srl.load(srl_model_path)
    result = srl.label(words, postags, arcs)
    srl.release()  # free the model before returning
    return result
def role_label(words, postags, netags, arcs):
    """Semantic role labelling: print predicate index and its arguments.

    Fix: converted the Python 2 ``print`` statement to a Python 3
    ``print()`` call, matching the rest of this file.
    """
    labeller = SementicRoleLabeller()  # build the labeller
    model = "srl"
    labeller.load(os.path.join(modelPath, model))  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # run SRL
    for role in roles:
        print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
             for arg in role.arguments]))
    labeller.release()  # free the model
def yuyijuese(words, postags, netags, arcs):
    """语义角色标注 (semantic role labelling).

    Prints every predicate with its arguments.
    Fix: the labeller is now released after use -- it previously leaked
    the loaded model on every call.
    """
    labeller = SementicRoleLabeller()
    labeller.load(os.path.join(MODELDIR, "srl/"))
    roles = labeller.label(words, postags, netags, arcs)
    for role in roles:
        print (role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
             for arg in role.arguments]))
    labeller.release()  # was missing: free the model
def ltp_sementic_role_labeller(LTP_DATA_DIR, words, postags, arcs):
    """Load the Windows SRL model under LTP_DATA_DIR and label one sentence.

    The model file lives inside the `srl` directory (`pisrl_win.model`,
    the Windows build of the pi-SRL model).  Returns the raw pyltp roles
    object; the model is released before returning, so the caller should
    consume the result promptly.
    """
    model_file = os.path.join(LTP_DATA_DIR, 'srl/pisrl_win.model')
    srl = SementicRoleLabeller()
    srl.load(model_file)
    roles = srl.label(words, postags, arcs)  # arcs: dependency-parse result
    srl.release()
    return roles
def role_label(words, postags, netags, arcs):
    """Print predicate index plus `NAME:(start,end)` for every SRL argument."""
    srl = SementicRoleLabeller()
    srl.load('/Users/zhangqinyuan/Downloads/ltp_data_v3.4.0/srl')
    for role in srl.label(words, postags, netags, arcs):
        pieces = ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                  for arg in role.arguments]
        print(role.index, "".join(pieces))
    srl.release()  # free the model
def role_label(words, postags, netags, arcs):
    """Run SRL (old-style API that also takes netags) and return the raw roles.

    The model is released before returning, so callers should consume the
    result promptly.
    """
    srl = SementicRoleLabeller()
    srl.load('../ltp_data/srl')
    roles = srl.label(words, postags, netags, arcs)
    srl.release()
    return roles
def role_label(words, postags, netags, arcs):
    """Label semantic roles and print one line per predicate."""
    srl = SementicRoleLabeller()
    srl.load(os.path.join(LTP_DATA_DIR, 'srl'))
    roles = srl.label(words, postags, netags, arcs)
    for role in roles:
        line = ""
        for arg in role.arguments:
            line += "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
        print(role.index, line)
    srl.release()  # free the model
def SrlFunction(contents):
    """Full pyltp pipeline demo: segment, POS-tag, dependency-parse and
    SRL-label *contents*, printing the result of every stage.

    NOTE(review): model paths (cws_model_path, pos_model_path,
    par_model_path, srl_model_path) and the hard-coded E:/D: lexicon
    paths come from module scope -- confirm they exist on this machine.
    """
    from pyltp import Segmentor
    segmentor = Segmentor()  # segmenter instance
    # segmentor.load(cws_model_path)  # plain model without a user lexicon
    segmentor.load_with_lexicon(cws_model_path, 'E:\\ltp_data_v3.4.0\\personal_seg.txt')
    words = segmentor.segment(contents)  # word segmentation
    # Print each word followed by its 1-based position.
    k = 1
    for word in words:
        print(word + str(k) + ' ', end='')
        k = k + 1
    print('\n')
    # print('\t'.join(words))
    segmentor.release()  # free the segmenter
    wordslist = list(words)

    from pyltp import Postagger
    postagger = Postagger()
    # postagger.load(pos_model_path)
    postagger.load_with_lexicon(pos_model_path, 'D:\\ltp_data_v3.4.0\\personal_pos.txt')
    postags = postagger.postag(wordslist)  # POS tagging
    print('\t'.join(postags))
    postagger.release()
    # wordslist = ['人力资源社会保障局','主管','医疗保险','工作']
    # postags = ['n','v','n','v']

    from pyltp import Parser
    parser = Parser()  # dependency parser instance
    parser.load(par_model_path)  # load the model
    arcs = parser.parse(wordslist, postags)  # dependency parsing
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # free the parser

    from pyltp import SementicRoleLabeller
    labeller = SementicRoleLabeller()  # SRL instance
    labeller.load(srl_model_path)  # load the model
    # arcs: the dependency-parse result feeds the SRL stage
    roles = labeller.label(wordslist, postags, arcs)  # semantic role labelling
    # Print one line per predicate: index then NAME:(start,end) per argument.
    for role in roles:
        print(
            role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
    labeller.release()  # free the model
def get_roles_by_pyltp(self, words_list, postags_list, arcs_list):
    """Semantic role labelling via pyltp's `pisrl.model`.

    :param words_list: segmented words
    :param postags_list: POS tags aligned with the words
    :param arcs_list: dependency-parse result
    :return: list of pyltp role objects

    Fixes: the result is now materialised with ``list()`` BEFORE the
    model is released -- pyltp results are views into native memory owned
    by the labeller, so iterating them after ``release()`` is unsafe.
    Also removed the dead initial ``roles_list = list()`` assignment.
    """
    # SRL model path; the model file is named 'pisrl.model'.
    srl_model_path = os.path.join(self.ltp_dir_path, "pisrl.model")
    labeller = SementicRoleLabeller()
    labeller.load(srl_model_path)
    roles = labeller.label(words_list, postags_list, arcs_list)
    roles_list = list(roles)  # copy out before freeing the model
    labeller.release()
    return roles_list
def get_role_list(self, words, postags):
    """Dependency-parse *words*, then run SRL; on any failure return [[]].

    Both models are always released via the ``finally`` block.
    """
    dep_parser = Parser()
    dep_parser.load(Dependency.par_model)
    srl = SementicRoleLabeller()
    srl.load(Dependency.pisrl_model)
    try:
        arcs = dep_parser.parse(words, postags)
        roles = srl.label(words, postags, arcs)
    except Exception:
        roles = [[]]  # sentinel: a single empty role set
    finally:
        dep_parser.release()
        srl.release()
    return roles
def srl(words, postags, arcs):
    """SRL with a lazily created, module-cached labeller.

    The global `labeller` is loaded on first use and deliberately kept
    alive so repeated calls don't reload the model.

    :return: flat list of (predicate_index, arg_name, start, end) tuples
    """
    global labeller
    if labeller is None:
        # SRL model path: `srl` is a directory, not a single file.
        srl_model_path = os.path.join(LTP_DATA_DIR, 'srl')
        labeller = SementicRoleLabeller()
        labeller.load(srl_model_path)
    roles = labeller.label(words, postags, arcs)  # arcs: dependency parse
    return [(role.index, arg.name, arg.range.start, arg.range.end)
            for role in roles
            for arg in role.arguments]
def get_srl(sentence):
    """Segment, POS-tag and parse *sentence*, then print its semantic roles."""
    srl_labeller = SementicRoleLabeller()
    srl_labeller.load(srl_model_path)
    words = list(pyltp_cut(sentence))        # segmentation
    postags = list(postagger.postag(words))  # POS tagging
    arcs = get_parsing(sentence)             # dependency parse feeds SRL
    roles = srl_labeller.label(words, postags, arcs)
    for role in roles:
        body = "".join("%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                       for arg in role.arguments)
        print(role.index, body)
    srl_labeller.release()  # free the model
def get_srl(self, words):
    """Semantic role labelling for a pre-segmented word list.

    POS tags and dependency arcs come from the instance's own helpers;
    each predicate's arguments are printed, and the raw roles object is
    returned (note: the model is released before returning).
    """
    srl = SementicRoleLabeller()
    srl.load(self.srl_model_path)
    postags = self.get_postags(words)
    arcs = self.get_dependency(words)
    roles = srl.label(words, postags, arcs)
    for role in roles:
        summary = "".join("%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                          for arg in role.arguments)
        print(role.index, summary)
    srl.release()
    return roles
def sentence_label(parse_result):
    """Run SRL over every parsed sentence, pausing every 50 sentences.

    :param parse_result: dict mapping key -> (words, postags, arcs)
    :return: list of (key, [(predicate_index, arg_name, start, end), ...])

    Fix: the labelled roles used to be computed and then discarded --
    ``final_result`` stayed empty forever.  They are now collected as
    plain tuples (role objects become invalid after the model is
    released) and the accumulated list is returned.
    """
    labeller = SementicRoleLabeller()
    labeller.load(srl_model_path)
    i = 0
    final_result = []
    for key, value in parse_result.items():
        i += 1
        if i % 50 == 0:
            # throttle: rest briefly every 50 sentences
            print('休息一下')
            time.sleep(5)
        words, postags, arcs = value[0], value[1], value[2]
        roles = labeller.label(words, postags, arcs)
        sent_roles = [(role.index, arg.name, arg.range.start, arg.range.end)
                      for role in roles for arg in role.arguments]
        final_result.append((key, sent_roles))
        print('done')
        print(final_result)
    labeller.release()
    return final_result
def labeller(word_tag, arcs, srl_model_path):
    '''
    Desc: semantic role labelling
    Args:
        word_tag(dict)   word -> POS-tag mapping
        arcs             dependency-parse result
        srl_model_path   path to the SRL model
    '''
    # Fix: the local used to shadow the function's own name `labeller`.
    srl_labeller = SementicRoleLabeller()
    srl_labeller.load(srl_model_path)
    # NOTE(review): a dict collapses duplicate words, so sentences with a
    # repeated word lose tokens here -- confirm upstream guarantees
    # uniqueness before relying on this.
    roles = srl_labeller.label(list(word_tag.keys()), list(word_tag.values()), arcs)
    for role in roles:
        print(
            role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
    srl_labeller.release()
class LtpParser(object):
    """pyltp pipeline facade: segmentation, POS, NER, dependency parse, SRL."""

    def __init__(self, data_dir: str):
        # data_dir: directory holding the LTP *.model files
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(data_dir, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(data_dir, "pos.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(data_dir, "ner.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(data_dir, "parser.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(data_dir, "pisrl.model"))

    def parse(self, text: str) -> dict:
        """Analyse *text* and return tokens, POS tags, NE tags and SRL labels.

        Fix: the return annotation used to claim ``List[str]`` although the
        method has always returned a dict.

        :return: {"tokens": [...], "postags": [...], "netags": [...],
                  "srlabels": {predicate index: {arg name: {"start", "end"}}}}
        """
        tokens = self.segmentor.segment(text)
        postags = self.postagger.postag(tokens)
        netags = self.recognizer.recognize(tokens, postags)
        arcs = self.parser.parse(tokens, postags)
        roles = self.labeller.label(tokens, postags, arcs)
        srlabels = {
            role.index: {
                arg.name: {
                    "start": arg.range.start,
                    "end": arg.range.end
                }
                for arg in role.arguments
            }
            for role in roles
        }
        return {
            "tokens": list(tokens),
            "postags": list(postags),
            "netags": list(netags),
            "srlabels": srlabels,
        }

    def release(self):
        """Free every loaded model."""
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
# Python 2 pyltp demo fragment: POS-tag, dependency-parse, NER and
# SRL the previously segmented `words`, printing each stage
# tab-separated.  `words` and MODELDIR are defined earlier in the file.
print "\t".join(words)

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# list-of-string parameter is support in 0.1.5
#postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print "\t".join(postags)

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)  # dependency parse
print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)  # named entities
print "\t".join(netags)

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))  # old-style srl model directory
#labeller.load("/home/yjliu/ltp/model/srl/")
# This pyltp version's SRL takes netags in addition to arcs.
roles = labeller.label(words, postags, netags, arcs)
# One line per predicate: index then NAME:(start,end) per argument.
for role in roles:
    print role.index, "".join([
        "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
        for arg in role.arguments
    ])
# Python 2 pyltp demo fragment (variant that releases all models at the
# end).  `words`, `segmentor`, `postagger` and MODELDIR are defined
# earlier in the file.
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# list-of-string parameter is support in 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print "\t".join(postags)

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)  # dependency parse
print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)  # named entities
print "\t".join(netags)

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))  # old-style srl model directory
# This pyltp version's SRL takes netags in addition to arcs.
roles = labeller.label(words, postags, netags, arcs)
for role in roles:
    print role.index, "".join(
        ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])

# Free every loaded model.
segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()
class Extractor():
    """(subject, predicate, object) triple extractor built on pyltp parses."""

    def __init__(self):
        self.__clause_list = []        # Clause objects found by split()
        self.__subclause_dict = {}     # word index -> owning subclause
        self.__triple_list = []        # extracted Triple objects
        self.__segmentor = Segmentor()
        self.__postagger = Postagger()
        self.__recognizer = NamedEntityRecognizer()
        self.__parser = Parser()
        self.__labeller = SementicRoleLabeller()
        self.__words_full_list = []    # all words across processed sentences
        self.__netags_full_list = []   # NE tags aligned with the word list

    @property
    def clause_list(self):
        return self.__clause_list

    @property
    def triple_list(self):
        return self.__triple_list

    def split(self, words, postags):
        """Split a word sequence into clauses at comma / full-stop punctuation,
        then split each clause into subclauses and index them."""
        start = 0
        for j, w in enumerate(words):
            # both ASCII and full-width comma, plus the Chinese full stop
            if w == ',' or w == ',' or w == '。':
                clause = Clause(start, j-1 )
                self.__clause_list.append(clause)
                start = j + 1
        for clause in self.__clause_list:
            clause.split(postags)
            for subclause in clause.sub_clause_list:
                self.add_inverted_idx(subclause)

    def add_inverted_idx(self, subclause):
        # inverted index: every covered word position maps to its subclause
        for i in range(subclause.start_idx, subclause.end_idx):
            self.__subclause_dict[i] = subclause

    def load(self):
        """Load all five LTP models from the relative ltp_data/ directory."""
        self.__segmentor.load('ltp_data/cws.model')
        self.__postagger.load('ltp_data/pos.model')
        self.__recognizer.load('ltp_data/ner.model')
        self.__parser.load('ltp_data/parser.model')
        self.__labeller.load('ltp_data/srl')

    def release(self):
        """Free all loaded models."""
        self.__segmentor.release()
        self.__postagger.release()
        self.__recognizer.release()
        self.__parser.release()
        self.__labeller.release()

    def clear(self):
        # reset per-document state (clause/subclause state is kept)
        self.__triple_list = []
        self.__words_full_list = []
        self.__netags_full_list = []

    def resolve_conference(self, entity):
        """Naive pronoun resolution: map 他/她 to the nearest preceding person NE.

        Returns '?' when the entity has no readable content.
        """
        try:
            e_str = entity.get_content_as_str()
        except Exception:
            return '?'
        ref = e_str
        if e_str == '他' or e_str == '她':
            # walk backwards from the entity position towards document start
            for i in range(entity.loc, -1, -1):
                if self.__netags_full_list[i].lower().endswith('nh'):
                    ref = self.__words_full_list[i]
                    break
        return ref

    def resolve_all_conference(self):
        """Apply pronoun resolution to entity_1 of every extracted triple."""
        for t in self.triple_list:
            e_str = self.resolve_conference(t.entity_1)
            try:
                t.entity_1.content = e_str.split()
            except Exception:
                pass

    def chunk_str(self, data):
        """Sentence-split *data*, run the full pyltp pipeline per sentence and
        extract triples; failed sentences are logged and skipped."""
        sents = SentenceSplitter.split(data)
        offset = 0  # running word offset of the current sentence in the doc
        for sent in sents:
            try:
                words = self.__segmentor.segment(sent)
                postags = self.__postagger.postag(words)
                netags = self.__recognizer.recognize(words, postags)
                arcs = self.__parser.parse(words, postags)
                roles = self.__labeller.label(words, postags, netags, arcs)
                self.chunk_sent(list(words), list(postags), list(arcs), offset)
                offset += len(list(words))
                self.__words_full_list.extend(list(words))
                self.__netags_full_list.extend(list(netags))
            except Exception as e:
                # best-effort processing: print the error and continue
                print(str(e))
                pass

    def chunk_sent(self, words, postags, arcs, offset):
        """Extract SBV-HED-VOB style triples from one dependency-parsed sentence."""
        # the single HED arc marks the sentence's main predicate
        root = [i for i,x in enumerate(arcs) if x.relation == 'HED']
        if len(root) > 1:
            raise Exception('More than 1 HEAD arc is detected!')
        root = root[0]
        # coordinated (COO) predicates share the head predicate's subject
        relations = [i for i, x in enumerate(arcs) if x.head == root and x.relation == 'COO']
        relations.insert(0,root)
        prev_e1 = None
        e1 = None
        for rel in relations:
            # subject (SBV) of this predicate
            left_arc = [i for i, x in enumerate(arcs) if x.head == rel and x.relation == 'SBV']
            if len(left_arc) > 1:
                pass #raise Exception('More than 1 left arc is detected!')
            elif len(left_arc) == 0:
                # no explicit subject: inherit from the previous predicate
                e1 = prev_e1
            elif len(left_arc) == 1:
                left_arc = left_arc[0]
                # extend the subject leftwards over its ATT modifiers
                leftmost = find_farthest_att(arcs, left_arc)
                e1 = Entity(1, [words[i] for i in range(leftmost, left_arc + 1)], offset + leftmost)
            prev_e1 = e1
            # object (VOB) of this predicate
            right_arc = [i for i, x in enumerate(arcs) if x.head == rel and x.relation == 'VOB']
            e2_list = []
            if not right_arc:
                # NOTE(review): an absent object yields a placeholder entity
                # in e2_list but no triple is created for it below -- confirm
                # whether that is intentional.
                e2 = Entity(2, None)
                e2_list.append(e2)
            else:
                right_ext = find_farthest_vob(arcs, right_arc[0])
                # coordinated objects hanging off the extended object
                items = [i for i, x in enumerate(arcs) if x.head == right_ext and x.relation == 'COO']
                items = right_arc + items
                count = 0
                for item in items:
                    leftmost = find_farthest_att(arcs, item)
                    e2 = None
                    if count == 0:
                        e2 = Entity(2, [words[i] for i in range(leftmost, right_ext + 1)], offset+leftmost)
                    else:
                        # later coordinated objects: stitch the shared
                        # modifier span with the object's own VOB span
                        p1 = range(leftmost, right_arc[0])
                        p2 = range(item, find_farthest_vob(arcs, item) + 1)
                        e2 = Entity(2, [words[i] for i in itertools.chain(p1, p2)])
                    e2_list.append(e2)
                    r = Relation(words[rel])
                    t = Triple(e1, e2, r)
                    self.__triple_list.append(t)
                    count += 1
# # print('看这里看这里!!!!:',roles) # for role in roles: # print(role.index, "".join( # ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])) # labeller.release() # 释放模型 # # words=['天安门','国旗','张思思','优秀'] # postags=posttagger(['天安门','国旗','张思思','优秀']) # netags=ner(words, postags) # arcs=parse(words, postags) # print("---*---"*10) # role_label(words, postags, netags, arcs) #------------------------------------------------- # 没有找到对应版本的 srl 模型 labeller = SementicRoleLabeller() # 初始化实例 labeller.load(r'D:\Corpus\ltp-models_full\3.2.0\submodels\srl\srl') # 加载模型 words = ['元芳', '你', '怎么', '看'] postags = ['nh', 'r', 'r', 'v'] # arcs 使用依存句法分析的结果 arcs = parse(words, postags) roles = labeller.label(words, postags, arcs) # 语义角色标注 print('roles====', roles) # 打印结果 for role in roles: print( role.index, "".join([ "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments ])) labeller.release() # 释放模型
class SentenceParser:
    """pyltp pipeline wrapper producing per-word child dicts, a formatted
    parse and SRL role dicts for a sentence."""

    def __init__(self):
        # LTP_DIR = './ltp_data_v3.4.0'
        print("加载模型路径", LTP_DIR)
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))
        print("加载完毕")

    '''Dependency parsing --- maintain, for every word, a dict of its
    dependency children (out-edges).'''
    '''
    In a dependency parse each word has at most one incoming arc but may
    have several outgoing arcs.  To make the result easy to consume we
    build one child dict per word:
    1) out-degree 0  -> empty dict
    2) out-degree n  -> dict with n entries
    '''
    def build_parse_child_dict(self, words, postags, arcs):
        """
        Format the dependency-parse result.
        :param words: segmentation result
        :param postags: POS tagging result
        :param arcs: dependency-parse result
        :return: child_dict_list, format_parse_list
        """
        '''
        `arcs` is a list with one element per word; each arc carries
        arc.head (1-based index of the word's parent, 0 = ROOT) and
        arc.relation (the relation to that parent) -- i.e. the in-edge.
        Returned:
        child_dict_list: per-word out-edge (children) info
        format_parse_list: per word: [relation, word, index, POS,
                                      head word, head index, head POS]
        '''
        child_dict_list = []
        format_parse_list = []
        # build the children dict for every word
        for index in range(len(words)):
            child_dict = dict()
            ## scan all arcs looking for children of this word
            for arc_index in range(len(arcs)):
                ## an arc pointing at this word (heads are 1-based) is a child
                if arcs[arc_index].head == index + 1:
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
        # build the formatted entry for every word:
        ## [relation, word, index, POS, head word, head index, head POS]
        # (NER info could be added here as well)
        rely_id = [arc.head for arc in arcs]       # parent id per word (0 = ROOT)
        relation = [arc.relation for arc in arcs]  # dependency relation per word
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]  # parent word
        for i in range(len(words)):
            # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [
                relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                postags[rely_id[i] - 1]
            ]
            format_parse_list.append(a)
        return child_dict_list, format_parse_list

    '''Semantic role labelling'''
    '''
    Only the sentence's predicates are analysed; their arguments and the
    argument-predicate relations are extracted.
    '''
    def format_labelrole(self, words, postags):
        """
        Format the SRL result.
        :param words: segmentation result
        :param postags: POS tagging result
        :return: dict: predicate index -> {arg name: [name, start, end]}
        """
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        '''
        `roles` holds one role per predicate in the sentence:
        role.index      -- predicate index
        role.arguments  -- the predicate's semantic arguments (an argument
                           may span several words)
        arg.name        -- semantic role type
        arg.range.start -- 0-based index of the argument's first word
        arg.range.end   -- index of the argument's last word
        '''
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict

    def close(self):
        """Release every loaded model."""
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()

    '''parser main entry'''
    '''
    Post-process the raw model outputs (words, postags, ners, arcs, roles)
    into easy-to-consume structures:
    child_dict_list:   per-word dependency-children info
    format_parse_list: per-word info plus its (unique) parent info
    roles_dict:        SRL result per predicate
    '''
    def parser_main(self, sentence):
        '''words, postags, ners, arcs are raw LTP model outputs'''
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        ners = list(self.recognizer.recognize(words, postags))
        arcs = self.parser.parse(words, postags)
        # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
        """
        arcs: arc.head is the 1-based index of the parent (0 = ROOT);
        arc.relation is the dependency relation.  Each word has at most one
        in-edge but may have several out-edges.
        """
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, ners, child_dict_list, format_parse_list, roles_dict
class LtpParser:
    """pyltp pipeline wrapper with user-lexicon segmentation/POS tagging."""

    def __init__(self):
        LTP_DIR = "../../res/ltp/ltp_data_v3.4.0"
        # NOTE(review): placeholder path -- must point at the user-dict dir.
        LTP_DIR_USER = "******"
        self.segmentor = Segmentor()
        # segmentation with an extra user lexicon
        self.segmentor.load_with_lexicon(os.path.join(LTP_DIR, "cws.model"),
                                         os.path.join(LTP_DIR_USER, "fulluserdict.txt"))
        # self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))
        self.postagger = Postagger()
        # POS tagging with the same user lexicon
        self.postagger.load_with_lexicon(os.path.join(LTP_DIR, "pos.model"),
                                         os.path.join(LTP_DIR_USER, "fulluserdict.txt"))
        # self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
        self.labeller = SementicRoleLabeller()
        # Windows build of the pi-SRL model
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))

    '''Semantic role labelling'''
    def format_labelrole(self, words, postags):
        # SRL needs the dependency arcs as input.
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        # predicate index -> {arg name: [name, start, end]}
        for role in roles:
            roles_dict[role.index] = {arg.name: [arg.name, arg.range.start, arg.range.end]
                                      for arg in role.arguments}
        return roles_dict

    def build_parse_child_dict_two(self, words, arcs):
        """
        Maintain, for every word in the sentence, a dict of its dependency
        children.
        Args:
            words: word list
            arcs: dependency-arc list
        """
        child_dict_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:  # arc heads are 1-based
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            # if child_dict.has_key('SBV'):
            #     print words[index],child_dict['SBV']
            child_dict_list.append(child_dict)
        return child_dict_list

    '''Dependency parsing --- per-word dict of dependency children'''
    def build_parse_child_dict(self, words, postags, arcs):
        # print(words, postags, "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:  # arc heads are 1-based
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]       # parent id per word (0 = ROOT)
        relation = [arc.relation for arc in arcs]  # dependency relation per word
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]  # parent word
        for i in range(len(words)):
            # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                 postags[rely_id[i] - 1]]
            format_parse_list.append(a)
        return child_dict_list, format_parse_list

    '''parser main entry'''
    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        parse_child_dict = self.build_parse_child_dict_two(words, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list, parse_child_dict

    '''parser main entry (variant that also returns NER tags and arcs)'''
    def parser_main_two(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        # NER: person, place and organisation names etc.
        netags = self.recognizer.recognize(words, postags)
        # format the dependency parse
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        # semantic roles
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, netags, arcs, child_dict_list, format_parse_list, roles_dict
class myLTP:
    """pyltp wrapper with regex-defined extra NE patterns and entity-pair
    extraction over whole texts."""

    def __init__(self, LTP_DATA_DIR, pattern_dir='pattern.txt'):
        # LTP_DATA_DIR: directory containing the LTP model files
        self.LTP_DATA_DIR = LTP_DATA_DIR
        # (ne_type, regex) pairs read from the pattern file
        self.ne_pattern = self._read_ne_pattern(pattern_dir)

    def _read_ne_pattern(self, filename):
        """Read (ne_type, regex) pairs; lines starting with '#' are comments."""
        ne_pattern = []
        with open(filename, encoding='utf8') as filein:
            for line in filein:
                if line[0] != '#':
                    np = line.split()[:2]
                    ne_pattern.append(np)
        return ne_pattern

    def find_ne_by_pattern(self, text):
        """Replace regex matches with their NE-type marker.

        :return: (rewritten text, {ne_type: [matched strings]})
        """
        ne_dic = defaultdict(list)
        for ne_type, pattern in self.ne_pattern:
            nes = re.findall(pattern, text)
            text = re.sub(pattern, ne_type, text)
            ne_dic[ne_type].extend(nes)
        return text, ne_dic

    def load(self, index=[1, 1, 1, 1, 1]):
        """Load the models selected by *index*: segmentation, POS tagging,
        NER, dependency parsing, semantic role labelling.

        NOTE(review): mutable default argument -- harmless here because it
        is only read, but worth replacing with a tuple.
        """
        if index[0]:
            cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')
            self.segmentor = Segmentor()
            self.segmentor.load(cws_model_path)
        if index[1]:
            pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')
            self.postagger = Postagger()
            self.postagger.load(pos_model_path)
        if index[2]:
            ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')
            self.recognizer = NamedEntityRecognizer()
            self.recognizer.load(ner_model_path)
        if index[3]:
            par_model_path = os.path.join(self.LTP_DATA_DIR, 'parser.model')
            self.parser = Parser()
            self.parser.load(par_model_path)
        if index[4]:
            srl_model_path = os.path.join(self.LTP_DATA_DIR, 'pisrl_win.model')
            self.labeller = SementicRoleLabeller()
            self.labeller.load(srl_model_path)

    def release(self):
        """Release whichever models were loaded.

        NOTE(review): the bare excepts tolerate never-loaded models but
        also hide any real release failure.
        """
        try:
            self.segmentor.release()
        except:
            pass
        try:
            self.postagger.release()
        except:
            pass
        try:
            self.recognizer.release()
        except:
            pass
        try:
            self.parser.release()
        except:
            pass
        try:
            self.labeller.release()
        except:
            pass

    def split_sentence(self, text):
        """Sentence splitting."""
        return SentenceSplitter.split(text)

    def word_segment(self, sentence):
        """Word segmentation -- uses jieba, not the LTP segmenter."""
        # words = self.segmentor.segment(sentence)
        words = jieba.cut(sentence)
        return list(words)

    def pos_tag(self, words):
        """POS tagging."""
        postags = self.postagger.postag(words)
        return postags

    def named_entity_recognize(self, words, postags):
        """Named entity recognition."""
        netags = self.recognizer.recognize(words, postags)
        return netags

    def parse(self, words, postags):
        """Dependency parsing."""
        arcs = self.parser.parse(words, postags)
        # (arc.head, arc.relation)
        return arcs

    def sementic_role_label(self, words, postags, arcs):
        """Semantic role labelling."""
        roles = self.labeller.label(words, postags, arcs)
        return roles

    def _get_ne_for_sentence(self, sentence):
        """Collect entities, including the regex-pattern-defined ones.

        :return: {'words': [...], 'ners': [[type, start, end], ...]}
        """
        sentence, ne_dic = self.find_ne_by_pattern(sentence)
        words = list(self.word_segment(sentence))
        postags = self.postagger.postag(words)
        ners = self.named_entity_recognize(words, postags)
        res = {}
        res['words'] = words
        res['ners'] = []
        for index, ner in enumerate(ners):
            if ner != 'O':
                # S-/B- marks the start of an entity span; other tags extend it
                if ner[0] in ('S', 'B'):
                    res['ners'].append([ner[2:], index, index + 1])
                else:
                    res['ners'][-1][-1] += 1
        # substitute the regex placeholders back with the original strings
        for ner_type, v in ne_dic.items():
            v = iter(v)
            if v:
                for index, word in enumerate(words):
                    if word == ner_type:
                        words[index] = v.__next__()
                        res['ners'].append([ner_type, index, index + 1])
        return res

    def _get_dne_for_sentence(self, sentence):
        """Return (entity1, entity2, words) for every entity pair in the sentence."""
        res = []
        s = self._get_ne_for_sentence(sentence)
        ners = s['ners']
        words = s['words']
        for entity1, entity2 in combinations(ners, 2):
            res.append((entity1, entity2, words))
        return res

    def get_dne(self, text):
        """Entity pairs over the whole text: person (Nh), place (Ns),
        organisation (Ni)."""
        res = []
        sentences = self.split_sentence(text)
        for sentence in sentences:
            r = self._get_dne_for_sentence(sentence)
            res.extend(r)
        return res
class ltp_api(object):
    """Thin facade over the pyltp toolchain.

    Every model is loaded at construction time; one helper method is
    exposed per analysis step, and get_result() runs the whole pipeline
    and caches the results in self.output.
    """

    def __init__(self, MODELDIR, exword_path=None):
        self.MODELDIR = MODELDIR
        self.output = {}          # filled in by get_result()
        self.words = None
        self.postags = None
        self.netags = None
        self.arcs = None
        self.exword_path = exword_path
        # e.x: '/data1/research/matt/ltp/exwords.txt'
        # word segmentation, optionally with an extra user lexicon
        self.segmentor = Segmentor()
        cws_model = os.path.join(self.MODELDIR, "cws.model")
        if self.exword_path:
            self.segmentor.load_with_lexicon(cws_model, self.exword_path)
        else:
            self.segmentor.load(cws_model)
        # part-of-speech tagging
        self.postagger = Postagger()
        self.postagger.load(os.path.join(self.MODELDIR, "pos.model"))
        # dependency parsing
        self.parser = Parser()
        self.parser.load(os.path.join(self.MODELDIR, "parser.model"))
        # named-entity recognition
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(self.MODELDIR, "ner.model"))
        # semantic role labelling
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(MODELDIR, "pisrl.model"))

    def ltp_segmentor(self, sentence):
        """Segment *sentence* into words."""
        return self.segmentor.segment(sentence)

    def ltp_postagger(self, words):
        """POS-tag a word sequence."""
        return self.postagger.postag(words)

    def ltp_parser(self, words, postags):
        """Dependency-parse tagged words."""
        return self.parser.parse(words, postags)

    def ltp_recognizer(self, words, postags):
        """Recognize named entities."""
        return self.recognizer.recognize(words, postags)

    def ltp_labeller(self, words, postags, arcs):
        """Semantic role labelling.

        Returns one list per predicate, each holding
        (predicate_index, arg_name, arg_start, arg_end) tuples.
        """
        roles = self.labeller.label(words, postags, arcs)
        return [
            [(role.index, arg.name, arg.range.start, arg.range.end)
             for arg in role.arguments]
            for role in roles
        ]

    def release(self):
        """Free every loaded model."""
        for tool in (self.segmentor, self.postagger, self.parser,
                     self.recognizer, self.labeller):
            tool.release()

    def get_result(self, sentence):
        """Run the full pipeline on *sentence* and cache everything in self.output."""
        self.words = self.ltp_segmentor(sentence)
        self.postags = self.ltp_postagger(self.words)
        self.arcs = self.ltp_parser(self.words, self.postags)
        self.netags = self.ltp_recognizer(self.words, self.postags)
        self.output['role'] = self.ltp_labeller(self.words, self.postags,
                                                self.arcs)
        self.output['words'] = list(self.words)
        self.output['postags'] = list(self.postags)
        self.output['arcs'] = [(arc.head, arc.relation) for arc in self.arcs]
        self.output['netags'] = list(self.netags)
class LtpParser:
    """pyltp pipeline (segmentation, POS tagging, NER, dependency parsing
    and semantic role labelling) loaded from a single model directory."""

    def __init__(self, ltp_dir="/home/demo1/support_ltp"):
        # ltp_dir: directory holding the pyltp model files.  Generalized
        # from the previously hard-coded path; the default preserves the
        # old behavior for existing callers.
        # 分词器
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(ltp_dir, "cws.model"))
        # 词性标注
        self.postagger = Postagger()
        self.postagger.load(os.path.join(ltp_dir, "pos.model"))
        # 依存句法分析
        self.parser = Parser()
        self.parser.load(os.path.join(ltp_dir, "parser.model"))
        # 命名实体识别
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(ltp_dir, "ner.model"))
        # 语义角色标注模块
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(ltp_dir, 'pisrl.model'))

    def format_labelrole(self, words, postags):
        """语义角色标注

        Dependency parsing runs first because role labelling consumes the
        arcs.  Returns a nested dict:
        {predicate_index: {arg_name: [arg_name, start, end]}}.
        (Leftover debug ``print`` of the dict removed.)
        """
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict

    def build_parse_child_dict(self, words, postags, arcs):
        """句法分析---为句子中的每个词语维护一个保存句法依存儿子节点的字典

        Returns (child_dict_list, format_parse_list):
        - child_dict_list[i] maps a relation name to the indices of token
          i's dependency children having that relation;
        - format_parse_list[i] = [relation, word, i, postag, head_word,
          head_index, head_postag], e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n'].
          A root token's head is reported as 'Root' with index -1 and
          head postag 'Root' (fix: the old code read postags[-1] — the
          LAST word's postag — for root-headed tokens).
        """
        # Hoist head/relation extraction so the O(n^2) child scan below
        # works on plain lists instead of re-reading arc attributes.
        rely_id = [arc.head for arc in arcs]        # 1-based head index, 0 = Root
        relation = [arc.relation for arc in arcs]   # dependency relation name
        child_dict_list = []
        for index in range(len(words)):
            child_dict = {}
            for arc_index, head in enumerate(rely_id):
                if head == index + 1:  # arc heads are 1-based
                    child_dict.setdefault(relation[arc_index], []).append(arc_index)
            child_dict_list.append(child_dict)
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]
        format_parse_list = []
        for i in range(len(words)):
            head_postag = 'Root' if rely_id[i] == 0 else postags[rely_id[i] - 1]
            format_parse_list.append([
                relation[i], words[i], i, postags[i],
                heads[i], rely_id[i] - 1, head_postag,
            ])
        return child_dict_list, format_parse_list

    def parser_main(self, sentence):
        """parser主函数: run the whole pipeline on one sentence."""
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list