def jufa_fenxi(words, postags):
    """Dependency parsing: print every arc as head:relation."""
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    arcs = parser.parse(words, postags)
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
def segmentation(filename, output_filename):
    print("segmenting '%s' to '%s'" % (filename, output_filename))
    f = open(filename, "r")
    lines = f.readlines()
    f.close()

    MODELDIR = "./ltp_data/"
    # segment
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    # postag
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    # Named Entity Recognize
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    # Parse and get SVO
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))

    f = open(output_filename, "w")
    fner = open(output_filename.split(".")[0] + "_ner.txt", "w")
    for _line in lines:
        # strip trailing newline characters; the original sliced off the last
        # character unconditionally, which also ate the final character of an
        # unterminated last line and crashed on blank lines
        line = _line.rstrip("\r\n")
        if not line:
            continue
        words = segmentor.segment(line)
        postags = postagger.postag(words)
        # netags = recognizer.recognize(words, postags)
        # arcs = parser.parse(words, postags)
        for i in range(len(words)):
            f.write("%s/%s\t" % (words[i], postags[i]))
            # if netags[i] != 'O':
            #     fner.write("%s/%s\t" % (words[i], netags[i]))
        f.write("\n")
        # fner.write("\n")
    f.close()
    fner.close()
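# --- Usage sketch (added, not part of the original snippet): assumes the LTP
# models live under ./ltp_data/ and that input.txt is a UTF-8 file with one
# sentence per line; both file names are hypothetical.
if __name__ == "__main__":
    # writes tab-separated "word/POS" pairs, one output line per input line
    segmentation("input.txt", "output_seg.txt")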
class LtpModel(object): """ 封装pyltp model 类,方便使用 """ @pysnooper.snoop() def __init__(self, LTP_DATA_DIR): """加载pyltp模型""" self.LTP_DATA_DIR = LTP_DATA_DIR # pyltp的存放路径 # 分词模型路径,分词模型名称是 'cws.model' cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model') self.segmentor = Segmentor() self.segmentor.load(cws_model_path) # 词性标注模型路径,分词模型名称是 'pos.model' pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model') self.postager = Postagger() self.postager.load(pos_model_path) # 命名实体识别模型路径,模型名称为'ner.model' ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model') self.recognizer = NamedEntityRecognizer() self.recognizer.load(ner_model_path) # 依存句法分析模型路径,模型名称为 'parser.model' par_model_path = os.path.join(self.LTP_DATA_DIR, 'parser.model') self.parser = Parser() self.parser.load(par_model_path) # # 语义角色标注模型目录路径,模型目录为'pisrl.model' # srl_model_path = os.path.join(self.LTP_DATA_DIR, 'pisrl.model') # self.labeller = SementicRoleLabeller() # 初始化实例 # self.labeller.load(srl_model_path) # 加载模型 def load_model(self): # """加载pyltp模型""" # # 分词模型路径,分词模型名称是‘cws.model’ # self.segment = Segmentor() # print(cws_model_path) # self.segment.load(cws_model_path) # # 词性标注模型路径,分词模型名称是‘pos.model’ # self.postager = Postagger() # self.postager.load(pos_model_path) # # # 命名实体识别模型路径,模型名称为`pos.model` # self.recognizer = NamedEntityRecognizer() # self.recognizer.load(ner_model_path) # # # 依存句法分析模型路径,模型名称为`parser.model` # self.parser = Parser() # self.parser.load(par_model_path) # # # 语义角色标注模型目录路径,模型目录为`srl` # self.labeller = SementicRoleLabeller() # 初始化实例 # self.labeller.load(srl_model_path) # 加载模型 # 加载word2vec 模型 pass @pysnooper.snoop() def release_all_model(self): """释放模型""" self.segmentor.release() self.postager.release() self.recognizer.release() self.parser.release() # word2vec 模型的释放 pass # 分句 @pysnooper.snoop() def split_sentences(self, string): sents = SentenceSplitter.split(string) sentences = [s for s in sents if len(s) != 0] return sentences def jieba_word_cut(self, string): string = re.findall( '[\d|\w|\u3002 |\uff1f |\uff01 |\uff0c |\u3001 |\uff1b |\uff1a |\u201c |\u201d |\u2018 |\u2019 |\uff08 |\uff09 |\u300a |\u300b |\u3008 |\u3009 |\u3010 |\u3011 |\u300e |\u300f |\u300c |\u300d |\ufe43 |\ufe44 |\u3014 |\u3015 |\u2026 |\u2014 |\uff5e |\ufe4f |\uffe5]+', string) string = ' '.join(string) return ' '.join(jieba.cut(string)) # 分词 @pysnooper.snoop() def split_words(self, sentences): sents = [self.jieba_word_cut(s) for s in sentences] return sents # 词性分析 @pysnooper.snoop() def get_word_pos(self, sents): postags = [self.postager.postag(words.split()) for words in sents] postags = [list(w) for w in postags] return postags # 依存句法分析 @pysnooper.snoop() def dependency_parsing(self, sents, postags, said): contents = [] for index in range(len(sents)): wo = sents[index].split() po = postags[index] netags = self.recognizer.recognize(wo, po) # 命名实体识别 netags = list(netags) # print(netags) if ('S-Nh' not in netags) and ('S-Ni' not in netags) and ( 'S-Ns' not in netags): # 人名、机构名、地名 当人名、机构名、地名在该句中则进行依存句法分析 continue arcs = self.parser.parse(wo, po) arcs = [(arc.head, arc.relation) for arc in arcs] # print(arcs) #[(2, 'SBV'), (0, 'HED'), (5, 'SBV'), (5, 'ADV'), (2, 'VOB')] arcs = [(i, arc) for i, arc in enumerate(arcs) if arc[1] == 'SBV'] # SBV 主谓关系 找出主谓关系的句子 # print(arcs) #[(0, (2, 'SBV')), (2, (5, 'SBV'))] for arc in arcs: verb = arc[1][0] # 2 5 subject = arc[0] # 0 1 if wo[verb - 1] not in said: # 如果wo[verb - 1]这个所对应的词语 在已建词表said中,则打印出来 continue # print(wo[subject],wo[verb - 1],''.join(wo[verb:])) 
contents.append((wo[subject], wo[verb - 1], ''.join(wo[verb:]))) # 依次为人物、"说"的近义词、文本 return contents @pysnooper.snoop() def get_sentences_json_result(self, string): """ 对输入的句子进行SBV提取 :param string: :return: list of dict [{}] """ sentences = self.split_sentences(string) # 分句 sents = self.split_words(sentences) # 分词 postags = self.get_word_pos(sents) # 词性分析 contents = self.dependency_parsing(sents, postags, txt_said) # 依存句法分析 # 拼装json结果 contents_dict = [] for ones in enumerate(contents): # json 字段 result = { 'name': ones[1][0], 'trigger': ones[1][1], 'content': ones[1][2] } contents_dict.append(result) return contents_dict
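# --- Usage sketch (added): a minimal call into LtpModel. The model directory
# path and the example sentence are illustrative, and txt_said (the list of
# "say"-type verbs consumed by dependency_parsing) is assumed to be defined
# elsewhere in the original project; whether a triple comes back depends on it.
if __name__ == "__main__":
    model = LtpModel("./ltp_data_v3.4.0")
    text = "新华社记者报道,张三表示,这项工作已经完成。"
    for item in model.get_sentences_json_result(text):
        print(item["name"], item["trigger"], item["content"])
    model.release_all_model()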
class Ltp(LtpSegment):
    __model_dir = os.path.join('source', 'ltp_data_v3.4.0')

    # part-of-speech tagging
    postagger = Postagger()
    postagger.load(os.path.join(__model_dir, "pos.model"))

    # named entity recognition
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(__model_dir, "ner.model"))

    # dependency parsing
    parser = Parser()
    parser.load(os.path.join(__model_dir, "parser.model"))

    # semantic role labelling
    labeller = SementicRoleLabeller()
    labeller.load(os.path.join(__model_dir, "pisrl.model"))

    def __init__(self):
        super().__init__()

    def postag(self, words):
        """
        Part-of-speech tagging.
        :param words: segmentation result, list
        :return: POS tags, list
        """
        postags = self.postagger.postag(words)
        return list(postags)

    def recognize(self, words, postags):
        """
        Named entity recognition.
        1. LTP uses the BIESO tagging scheme: B marks the first word of an entity,
           I a word inside it, E its last word, S a single-word entity, and O a
           word that is not part of any named entity.
        2. The entity types provided by LTP are person (Nh), place (Ns) and
           organization (Ni).
        3. The B/I/E/S position tag and the type tag are joined by a hyphen; the
           O tag carries no type tag. For example, S-Nh means a single word that
           forms a person name on its own.
        :param words: segmentation result, list
        :param postags: POS tagging result, list
        :return: named entity tags, list
        """
        netags = self.recognizer.recognize(words, postags)
        return list(netags)

    def parse(self, words, postags):
        """
        Dependency parsing.
        :param words: segmentation result, list
        :param postags: POS tagging result, list
        :return: raw ltp result, (arc.head, arc.relation) for arc in arcs
                 The ROOT node has index 0 and the words are indexed 1, 2, 3, ...
                 arc.head is the index of the arc's head word; arc.relation is the
                 dependency relation.
        Example:
            inputs:
                words = ['元芳', '你', '怎么', '看']
                postags = ['nh', 'r', 'r', 'v']
            output (head:relation per word):
                4:SBV  4:SBV  4:ADV  0:HED
        """
        arcs = self.parser.parse(words, postags)
        return arcs

    def label(self, words, postags, arcs):
        """
        Semantic role labelling.
        :param words: segmentation result, list
        :param postags: POS tagging result, list
        :param arcs: dependency parsing result from ltp
        :return: raw ltp result, (arg.name, arg.range.start, arg.range.end)
                 for arg in role.arguments; words are indexed from 0.
        The returned roles cover every predicate in the sentence. A sentence may
        contain no semantic roles, so the result can be empty. role.index is the
        index of the predicate and role.arguments the roles attached to it;
        arg.name is the role type, arg.range.start / arg.range.end the indices of
        the first and last word of that role.
        Example:
            inputs:
                words = ['元芳', '你', '怎么', '看']
                postags = ['nh', 'r', 'r', 'v']
                arcs as returned by parse()
            output:
                3 A0:(0,0)A0:(1,1)ADV:(2,2)
            A single line is printed, so “元芳你怎么看” has one set of semantic
            roles. The predicate index is 3, i.e. “看”, with three roles spanning
            (0,0) “元芳”, (1,1) “你” and (2,2) “怎么”, of types A0, A0 and ADV.
        """
        roles = self.labeller.label(words, postags, arcs)
        return roles

    def get_name_entity(self, sentence, entity_type):
        """
        Collect the named entities of a given type from a sentence.
        :param sentence: sentence to analyse
        :param entity_type: entity type to collect (Nh / Ns / Ni)
        :return: list of entity strings
        """
        words = self.segment(sentence)
        postags = self.postag(words)
        ne_tags = self.recognize(words, postags)
        sentence_len = len(words)
        ret_entity = set()
        entity_pattern = ""
        for i in range(sentence_len):
            # B-/I- tags accumulate the entity; the original tested 'B-' twice,
            # which silently dropped the middle (I-) words of long entities
            if (ne_tags[i] == 'B-' + entity_type) or (ne_tags[i] == 'I-' + entity_type):
                entity_pattern += words[i]
            elif (ne_tags[i] == 'E-' + entity_type) or (ne_tags[i] == 'S-' + entity_type):
                entity_pattern += words[i]
                ret_entity.add(entity_pattern)
                entity_pattern = ""
        return list(ret_entity)
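# --- Usage sketch (added): extracting person names (Nh) with the Ltp wrapper
# above. Assumes the LtpSegment base class provides segment() and that the model
# files under source/ltp_data_v3.4.0 are present; the expected output is only
# indicative.
if __name__ == "__main__":
    ltp = Ltp()
    print(ltp.get_name_entity("国家主席习近平抵达美国佛罗里达州", "Nh"))  # e.g. ['习近平']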
import os

LTP_DATA_DIR = r"D:\myprojects\LTP\ltp_data_v3.4.0"  # raw string so the backslashes are not treated as escapes
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parsing model, `parser.model`
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')     # named entity recognition
srl_model_path = os.path.join(LTP_DATA_DIR, 'pisrl.model')   # semantic role labelling; in ltp_data_v3.4.0 this is the `pisrl.model` file, not a `srl` directory

from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

segmentor = Segmentor()          # initialise the instance
segmentor.load(cws_model_path)
postagger = Postagger()          # initialise the instance
postagger.load(pos_model_path)   # load the model
parser = Parser()                # initialise the instance
parser.load(par_model_path)      # load the model
recognizer = NamedEntityRecognizer()
recognizer.load(ner_model_path)
labeller = SementicRoleLabeller()
labeller.load(srl_model_path)

line = '手机外型很漂亮,屏幕也不错,就是太容易发烫了,电池不耐用,这些都是预想到的,我很少玩游戏就还好。喇叭真的太垃圾了。'
words = list(segmentor.segment(line))
postags = list(postagger.postag(words))
arcs = parser.parse(words, postags)            # dependency parsing
netags = recognizer.recognize(words, postags)  # named entity recognition
# SementicRoleLabeller.label takes (words, postags, arcs); the original also
# passed netags, which belongs to the pre-3.4 API and fails here
roles = labeller.label(words, postags, arcs)
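# --- Added sketch: printing the semantic roles returned above, in the same
# "ARG:(start,end)" format used by the other pyltp examples in this collection,
# then releasing the models if the script ends here.
for role in roles:
    print(role.index, "".join(
        "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
        for arg in role.arguments))

segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()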
def test_ltp(): from pyltp import Segmentor segmentor = Segmentor() #segmentor.load('/Users/a000/Downloads/ltp-models/3.3.2/ltp_data.model') segmentor.load('/Users/a000/git/ltp_data/cws.model') words = segmentor.segment('元芳你怎么看') words = segmentor.segment('这本书很好, 我喜欢iphone, 1.5') words = segmentor.segment('张子萱怀孕了') words = segmentor.segment('我有一本书') words = segmentor.segment('今天是2017年3月30日, 清朝的官员') words = segmentor.segment('蚂蚁金服近日上市') words = segmentor.segment('国家主席习近平抵达美国佛罗里达州') words = segmentor.segment('独家|你想要的胸以下全是腿, 科切拉潮人用不') total_txt = '<a href=\"http://deeporiginalx.com/search.html#sw=%E7%AC%AC%E4%B8%80%E7%99%BD%E9%93%B6%E7%BD%91\" target=\"_blank\">第一白银网</a>4月19日讯<a href=\"http://deeporiginalx.com/search.html#sw=%E7%8E%B0%E8%B4%A7%E7%99%BD%E9%93%B6\" target=\"_blank\">现货白银</a>今日早盘走势受到美元反弹影响继续走软,目前交投于18.2一线,本周二美国总统特朗普再次提及税改政策,并且宣称将会以“迅雷不及掩耳之势”落地,据小编分析,税改落地将会利好美国经济,从而利好美元,打压白银走势,但问题是,3月份连医改都进展不顺,税改会通过吗?(<a href=\"http://deeporiginalx.com/search.html#sw=%E7%BC%96%E8%BE%91%E6%8E%A8%E8%8D%90%EF%BC%9A%E6%9C%AA%E6%9D%A5%E7%99%BD%E9%93%B6%E8%B5%B0%E5%8A%BF%E5%88%86%E6%9E%90\" target=\"_blank\"><strong><span>编辑推荐:未来白银走势分析</span></strong></a>' total_txt = "<span class=\"article_src\">游民星空</span>2017-04-09<span>阅读原文</span>" soup = BeautifulSoup(total_txt, 'lxml') total_txt = soup.get_text() print total_txt print type(total_txt) words = segmentor.segment(total_txt.encode('utf-8')) #words = segmentor.segment(s) for i in words: print i import jieba w_jieba = jieba.cut('独家|你想要的胸以下全是腿, 科切拉潮人用不') print '!!!!!' for i in w_jieba: print i from pyltp import Postagger poser = Postagger() poser.load('/Users/a000/git/ltp_data/pos.model') #words_pos = poser.postag(words) #for i in xrange(len(words_pos)): # print words[i] # print words_pos[i] s1 = '张继科:脚伤恢复七八成 现在不是想退役的时候' s2 = '张继科:脚伤恢复八成 现在还不是退役的时候' #s2 = '张继科和马龙:脚伤恢复八成 现在还不是退役的时候' s3 = '张继科:脚伤已恢复7-8成 现在还不是退役的时候' s4 = '国际乒联排名:马龙丁宁占据榜首 张继科第四' s5 = '国际乒联公布排名:马龙丁宁第一 张继科第四' s6 = '国家主席习近平抵达美国佛罗里达州' s7 = '习近平抵达美国佛罗里达州' s8 = '习近平抵达美国佛罗里达州 同特朗普会晤' s9 = '习近平抵达美国佛罗里达州 将与特朗普举行会晤' s10 = '习近平抵达美国 将同特朗普举行会晤' s11 = '习近平抵达美国佛罗里达州 将同特朗普举行中美元首会晤' s12 = '【V观】习近平引用芬兰谚语:没有人的开拓就不会有路' s13 = '习近平引用芬兰谚语:没有人的开拓就不会有路' s14 = '习近平就圣彼得堡地铁发生爆炸造成伤亡向普京致慰问电' # s15 = '习近平就圣彼得堡地铁爆炸事件向普京致慰问电' #15135383 ss16 = '习近平就圣彼得堡市地铁发生爆炸造成严重人员伤亡向普京致慰问电' #15130013 ss17 = '习近平就圣彼得堡市地铁爆炸向普京致慰问电' #15127277 s16 = '习近平离京对芬兰进行国事访问并赴美国举行中美元首会晤' #15131991 s17 = '习近平离京对芬兰进行国事访问并赴美举行中美元首会晤' #15132864 s18 = '习近平离京对芬兰共和国进行国事访问并赴美国佛罗里达州举行中美元首会晤' #15131971 ws1 = segmentor.segment(s6) ws2 = segmentor.segment(s7) print ' '.join(ws1) print ' '.join(ws2) pos1 = poser.postag(ws1) pos2 = poser.postag(ws2) print ' '.join(pos1) print ' '.join(pos2) from pyltp import NamedEntityRecognizer reco = NamedEntityRecognizer() reco.load('/Users/a000/git/ltp_data/ner.model') ne1 = reco.recognize(ws1, pos1) ne2 = reco.recognize(ws2, pos2) print ' '.join(ne1) print ' '.join(ne2) from pyltp import Parser parser = Parser() parser.load('/Users/a000/git/ltp_data/parser.model') arc1 = parser.parse(ws1, pos1) arc2 = parser.parse(ws2, pos2) print ' '.join("%d:%s" % (arc.head, arc.relation) for arc in arc1) print ' '.join("%d:%s" % (arc.head, arc.relation) for arc in arc2)
def getRelation(paragraph): """ paragraph: a list of string, each string is a sentence return: a list of relations and a dict which records the number of occurrence of differents DSNF """ relations = [] dict_DSNF = { 'num_DSNF1': 0, 'num_DSNF2': 0, 'num_DSNF3': 0, 'num_DSNF7': 0, } segmentor = Segmentor() segmentor.load(os.path.join(MODELDIR, "cws.model")) postagger = Postagger() postagger.load(os.path.join(MODELDIR, "pos.model")) parser = Parser() parser.load(os.path.join(MODELDIR, "parser.model")) recognizer = NamedEntityRecognizer() recognizer.load(os.path.join(MODELDIR, "ner.model")) for iteration, sentence in enumerate(paragraph): sentence = SentenceSplitter.split(sentence)[0] words = segmentor.segment(sentence) # print("\t".join(words)) postags = postagger.postag(words) # list-of-string parameter is support in 0.1.5 # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"]) # print("\t".join(postags)) arcs = parser.parse(words, postags) # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)) netags = recognizer.recognize(words, postags) # print("\t".join(netags)) # labeller = SementicRoleLabeller() # labeller.load(os.path.join(MODELDIR, "pisrl.model")) # roles = labeller.label(words, postags, arcs) # for role in roles: # print(role.index, "".join( # ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])) entityList = findEntities(netags) # print(entityList) entities = [] for i in entityList: l = '' for j in i: l += words[j] entities.append(l) DSNF1_ret = DSNF1(arcs, entityList, words, netags) DSNF2_ret = DSNF2(arcs, entityList, words) DSNF3_ret = DSNF3(arcs, entityList, words, postags) DSNF7_ret = DSNF7(arcs, entityList, words) # print("DSNF1 result: ", DSNF1_ret) # print("DSNF2 result: ", DSNF2_ret) # print("DSNF3 result: ", DSNF3_ret) # print("DSNF7 result: ", DSNF7_ret) relation = [] for r in DSNF1_ret: dict_DSNF['num_DSNF1'] += 1 new_r = [r[0], r[2], r[1]] relation.append((new_r, sentence)) relations.append((new_r, sentence)) for r in DSNF2_ret: dict_DSNF['num_DSNF2'] += 1 new_r = [r[0], r[2], r[1]] relation.append((new_r, sentence)) relations.append((new_r, sentence)) for r in DSNF3_ret: dict_DSNF['num_DSNF3'] += 1 new_r = [r[0], r[2], r[1]] relation.append((new_r, sentence)) relations.append((new_r, sentence)) for r in DSNF7_ret: dict_DSNF['num_DSNF7'] += 1 new_r = [r[0], r[2], r[1]] relation.append((new_r, sentence)) relations.append((new_r, sentence)) if len(relation) > 0: print("evaluate the " + str(iteration + 1) + "-th sentences") print("entities in " + str(iteration + 1) + "-th sentence : ", entities) for one in relation: r = one[0] data = {'sentence': sentence, 'kg': [r[0], r[1], r[2]]} # print('r',r) key = get_key(data) old = DB.kg_mark.find_one({"_id": key}) if old == None: kg.mark_sentence(key, data) else: print("已经存在跳过") continue print(one) p, softmax = pre(data) print("with entities relation: ", r) print("预测:", p, "概率:", softmax) data['label'] = p data['state'] = '4' #设置状态4独立开来 print(data) # if len(relation)==3: # print("关系",relation[1],relation[2],relation[0]) print("--" * 30) segmentor.release() postagger.release() parser.release() recognizer.release() # labeller.release() return relations, dict_DSNF
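# --- Usage sketch (added): calling getRelation on a small paragraph. MODELDIR,
# the DSNF1/2/3/7 extractors, findEntities, DB, kg and pre() all come from the
# surrounding project and are assumed to be importable; the sentences are
# illustrative only, and the function will touch the project's database as it runs.
if __name__ == "__main__":
    paragraph = ["中国进出口银行与中国银行加强合作。",
                 "国家主席习近平抵达美国佛罗里达州。"]
    relations, dsnf_counts = getRelation(paragraph)
    print("extracted %d relations" % len(relations))
    print("DSNF pattern counts:", dsnf_counts)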
for k, v in type2questions.items():
    print(k, len(v))
    for i in v[:10]:
        print(i)

with open('../data/question_type.txt', 'w') as f_out:
    for k, v in type2questions.items():
        f_out.write(k + '\n')
        for i in v:
            tmp = ' '.join(i[0])
            f_out.write(tmp + '\t' + '\t'.join(i[1:]) + '\n')


if __name__ == '__main__':
    # test('2006年7月27日,360安全卫士正式推出。')
    # get_all_questions()
    my_parser = Parser()
    # analysis_questions(my_parser)
    # my_parser.get_question_type('缓刑适用于几年以下的有期徒刑')
    my_parser.read_train_set('../data/BoP2017-DBQA.train.txt')
    count = 0
    for i in range(len(my_parser.articles)):
        res = my_parser.analysis_question(i, debug=False)
        # for i in range(10):
        #     res = my_parser.analysis_question(i, debug=True)
        if res == 0:
            count += 0
        else:
            count += 1.0 / res
    print('score', count / len(my_parser.articles))
    # my_parser.analysis_question(0)
class NLP: """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析 Attributes: default_user_dict_dir:str,用户自定义词典目录 default_model_dir:str,ltp模型文件目录 """ entity_verb_new = entity_verb_new() all_entity = entity_verb_new.readAllEntity( "../../entity_verb//entity_verb_result\\all_entity.json") default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\' #LTP模型文件目录 def __init__(self, model_dir=default_model_dir, all_entity=all_entity): self.default_model_dir = model_dir #初始化分词器 #使用jieba分析,将抽取出的所有实体,作为词典加入jieba中 for entity in all_entity: jieba.add_word(entity, 100000) jieba.add_word("天府永藏展", 100000) # jieba.add_word("始建于",100000) # pynlpir.open()#初始化分词器 # #添加用户词典(法律文书大辞典与清华大学法律词典),这种方式是添加进内存中,速度更快 # files = os.listdir(user_dict_dir) # for file in files: # file_path = os.path.join(user_dict_dir,file) # #文件夹则跳过 # if os.path.isdir(file): # continue # with open(file_path,'r',encoding = 'utf-8') as f: # line = f.readline() # while line: # word = line.strip('\n').strip() # pynlpir.nlpir.AddUserWord(c_char_p(word.encode())) # line = f.readline() #加载ltp模型 #词性标注模型 self.postagger = Postagger() postag_flag = self.postagger.load( os.path.join(self.default_model_dir, 'pos.model')) #命名实体识别模型 self.recognizer = NamedEntityRecognizer() ner_flag = self.recognizer.load( os.path.join(self.default_model_dir, 'ner.model')) #依存句法分析模型 self.parser = Parser() parser_flag = self.parser.load( os.path.join(self.default_model_dir, 'parser.model')) if postag_flag or ner_flag or parser_flag: #可能有错误 print('load model failed') def segment(self, sentence, entity_postag=dict()): """ 采用NLPIR进行分词处理 Args: Sentence:String,句子 entity_postag : dict,实体词性词典,默认为空集合,分析每一个案例的结构化文本时产生 Returns: lemma:list,分词结果 """ #添加实体词典 # if entity_postag: # for entity in entity_postag: # pynlpir.nlpir.AddUserWord(c_char_p(entity.encode())) # pynlpir.nlpir.AddUserWord(c_char_p('前任'.encode()))#单个用户加入示例 # pynlpir.nlpir.AddUserWord(c_char_p('习近平'.encode()))#单个用户加入示例 #分词,不进行词性标注 result = jieba.cut(sentence) # pynlpir.close() # 释放 lemmas = [] for lemma in result: lemmas.append(lemma) # lemmas = pynlpir.segment(sentence,pos_tagging=False) #pynlpir.close() #释放 return lemmas def getPostag(self): return self.postagger def postag(self, lemmas): """ Parameters ---------- lemmas : List,分词后的结果 entity_dict:Set,实体词典,处理具体的一则判决书的结构化文本时产生 Returns ------- words:WordUnit List,包括分词与词性标注的结果 """ words = [] #词性标注 postags = self.postagger.postag(lemmas) for i in range(len(lemmas)): #存储分词与词性标记后的词单元WordUnit,编号从1开始 word = WordUnit(i + 1, lemmas[i], postags[i]) words.append(word) #self.postagger.release() #释放 return words def get_postag(self, word): """获得单个词的词性标注 Args: word:str,单词 Returns: pos_tag:str,该单词的词性标注 """ pos_tag = self.postagger.postag([word]) return pos_tag[0] def netag(self, words): """ 命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并 Parameters words : WordUnit list,包括分词与词性标注结果 Returns words_netag:WordUnit list,包含分词,词性标注与命名实体识别的结果 """ lemmas = [] #存储分词后的结果 postags = [] #存储词性标注结果 for word in words: lemmas.append(word.lemma) postags.append(word.postag) #命名实体识别 netags = self.recognizer.recognize(lemmas, postags) words_netag = EntityCombine().combine(words, netags) return words_netag def parse(self, words): """ 对分词,词性标注与命名实体识别后的结果进行依存句法分析(命名实体识别可选) Args: words_netag:WordUnit list,包含分词,词性标注与命名实体识别结果 Returns *:sentenceUnit 句子单元 """ lemmas = [] #分词结果 postags = [] #词性标注结果 for word in words: lemmas.append(word.lemma) postags.append(word.postag) #依存句法分析 arcs = self.parser.parse(lemmas, postags) for i in range(len(arcs)): words[i].head = arcs[i].head words[i].dependency = arcs[i].relation return 
SentenceUnit(words) def close(self): """ 关闭与释放 """ # pynlpir.close() self.postagger.release() self.recognizer.release() self.parser.release() def getSPO1(self, sentence_list): for sentence in sentence_list: lemmas = nlp.segment(sentence) print(lemmas) # 词性标注测试 print('***' + '词性标注测试' + '***') words = nlp.postag(lemmas) # for word in words: # print(word.to_string()) # print(words) # 命名实体识别与合并测试 print('***' + '命名实体识别与合并测试' + '***') words_netag = nlp.netag(words) # for word in words_netag: # print(word.to_string()) # 依存句法分析测试 print('***' + '依存句法分析测试' + '***') sentence = nlp.parse(words_netag) print(sentence.to_string()) verb = True # entity = "乾清宫" for item in sentence.words: if (item.head_word == None and item.lemma == verb) or ( item.lemma == verb and item.dependency == "COO" and item.head_word.head_word == None): relation_verb = item if item.head_word == None: verbId = item.ID elif item.head_word.head_word == None: verbId = item.ID verbId2 = item.head_word.ID O_dict = dict() S_dict = dict() OBJ = None SUB = None for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verbId: # if SUB == None or SUB.lemma != entity: SUB = item S_dict[SUB.lemma] = SUB.ID if item.dependency == "VOB" and item.head_word.ID == verbId: OBJ = item O_dict[OBJ.lemma] = OBJ.ID if SUB == None: for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verbId2: # if SUB == None or SUB.lemma != entity: SUB = item S_dict[SUB.lemma] = SUB.ID if OBJ == None: for item in sentence.words: if item.dependency == "VOB" and item.head_word.ID == verbId2: OBJ = item O_dict[OBJ.lemma] = OBJ.ID OBJList = [] flag = True while flag == True: len1 = len(S_dict) len2 = len(O_dict) for item in sentence.words: if SUB != None and item.head_word != None: SUBList = S_dict.values() if item.head_word.ID in SUBList and ( item.dependency == "ATT" or item.dependency == "COO"): SUBATT = item S_dict[SUBATT.lemma] = SUBATT.ID if OBJ != None and item.head_word != None: OBJList = O_dict.values() if item.head_word.ID in OBJList and ( item.dependency == "ATT"): OBJATT = item O_dict[OBJATT.lemma] = OBJATT.ID if len(S_dict) != len1 or len(O_dict) != len2: flag = True else: flag = False O_dict = sorted(O_dict.items(), key=lambda item: item[1]) S_dict = sorted(S_dict.items(), key=lambda item: item[1]) Object = "" Subject = "" for i in O_dict: Object += i[0] for i in S_dict: Subject += i[0] if SUB != None: print((Subject, verb, Object)) S_dict2 = dict() O_dict2 = dict() SUB_COO = None OBJ_COO = None for item in sentence.words: if item.head_word != None: if SUB != None and item.dependency == "COO" and item.head_word.ID == SUB.ID: # if SUB == None or SUB.lemma != entity: SUB_COO = item S_dict2[SUB_COO.lemma] = SUB_COO.ID if item.head_word != None: if item.dependency == "COO" and item.head_word.ID == OBJ.ID: OBJ_COO = item O_dict2[OBJ_COO.lemma] = OBJ_COO.ID flag = True while flag == True: len1 = len(S_dict2) len2 = len(O_dict2) for item in sentence.words: if SUB_COO != None and item.head_word != None: SUBList = S_dict2.values() if item.head_word.ID in SUBList and item.dependency == "ATT": SUBATT = item S_dict2[SUBATT.lemma] = SUBATT.ID if OBJ_COO != None and item.head_word != None: OBJList = O_dict2.values() if item.head_word.ID in OBJList and item.dependency == "ATT": OBJATT = item O_dict2[OBJATT.lemma] = OBJATT.ID if len(S_dict2) != len1 or len(O_dict2) != len2: flag = True else: flag = False O_dict2 = sorted(O_dict2.items(), key=lambda item: item[1]) S_dict2 = sorted(S_dict2.items(), key=lambda item: item[1]) if 
len(O_dict2) or len(S_dict2): if len(O_dict2) == 0: O_dict2 = O_dict if len(S_dict2) == 0: S_dict2 = S_dict Object = "" Subject = "" for i in O_dict2: Object += i[0] for i in S_dict2: Subject += i[0] if SUB != None: print((Subject, verb, Object)) def getSPO2(self, sentence_list): for sentence in sentence_list: lemmas = nlp.segment(sentence) print(lemmas) # 词性标注测试 print('***' + '词性标注测试' + '***') words = self.postag(lemmas) # for word in words: # print(word.to_string()) # print(words) # 命名实体识别与合并测试 print('***' + '命名实体识别与合并测试' + '***') words_netag = nlp.netag(words) # for word in words_netag: # print(word.to_string()) # 依存句法分析测试 print('***' + '依存句法分析测试' + '***') sentence = nlp.parse(words_netag) print(sentence.to_string()) # verb = True # entity = "乾清宫" for item in sentence.words: if (item.head_word == None and item.postag == "v") or ( item.postag == "v" and item.dependency == "COO" and item.head_word.head_word == None): relation_verb = item if item.head_word == None: verbId = item.ID elif item.head_word.head_word == None: verbId = item.ID verbId2 = item.head_word.ID O_dict = dict() S_dict = dict() OBJ = None SUB = None for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verbId: # if SUB == None or SUB.lemma != entity: SUB = item S_dict[SUB.lemma] = SUB.ID if (item.dependency == "VOB" and item.head_word.ID == verbId) or (item.dependency == "POB" and item.head_word.ID == verbId)\ or (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP" and item.head_word.head_word.ID == verbId): OBJ = item O_dict[OBJ.lemma] = OBJ.ID if SUB == None: for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verbId2: # if SUB == None or SUB.lemma != entity: SUB = item S_dict[SUB.lemma] = SUB.ID if OBJ == None: for item in sentence.words: if item.dependency == "VOB" and item.head_word.ID == verbId2: OBJ = item O_dict[OBJ.lemma] = OBJ.ID OBJList = [] flag = True while flag == True: len1 = len(S_dict) len2 = len(O_dict) for item in sentence.words: if SUB != None and item.head_word != None: SUBList = S_dict.values() if item.head_word.ID in SUBList and ( item.dependency == "ATT" or item.dependency == "COO"): SUBATT = item S_dict[SUBATT.lemma] = SUBATT.ID if OBJ != None and item.head_word != None: OBJList = O_dict.values() if item.head_word.ID in OBJList and ( item.dependency == "ATT"): OBJATT = item O_dict[OBJATT.lemma] = OBJATT.ID if len(S_dict) != len1 or len(O_dict) != len2: flag = True else: flag = False O_dict = sorted(O_dict.items(), key=lambda item: item[1]) S_dict = sorted(S_dict.items(), key=lambda item: item[1]) Object = "" Subject = "" for i in O_dict: Object += i[0] for i in S_dict: Subject += i[0] if SUB != None: print((Subject, relation_verb.lemma, Object)) S_dict2 = dict() O_dict2 = dict() SUB_COO = None OBJ_COO = None for item in sentence.words: if item.head_word != None: if SUB != None and item.dependency == "COO" and item.head_word.ID == SUB.ID: # if SUB == None or SUB.lemma != entity: SUB_COO = item S_dict2[SUB_COO.lemma] = SUB_COO.ID if item.head_word != None and OBJ != None: if item.dependency == "COO" and item.head_word.ID == OBJ.ID: OBJ_COO = item O_dict2[OBJ_COO.lemma] = OBJ_COO.ID flag = True while flag == True: len1 = len(S_dict2) len2 = len(O_dict2) for item in sentence.words: if SUB_COO != None and item.head_word != None: SUBList = S_dict2.values() if item.head_word.ID in SUBList and item.dependency == "ATT": SUBATT = item S_dict2[SUBATT.lemma] = SUBATT.ID if OBJ_COO != None and item.head_word != 
None: OBJList = O_dict2.values() if item.head_word.ID in OBJList and item.dependency == "ATT": OBJATT = item O_dict2[OBJATT.lemma] = OBJATT.ID if len(S_dict2) != len1 or len(O_dict2) != len2: flag = True else: flag = False O_dict2 = sorted(O_dict2.items(), key=lambda item: item[1]) S_dict2 = sorted(S_dict2.items(), key=lambda item: item[1]) if len(O_dict2) or len(S_dict2): if len(O_dict2) == 0: O_dict2 = O_dict if len(S_dict2) == 0: S_dict2 = S_dict Object = "" Subject = "" for i in O_dict2: Object += i[0] for i in S_dict2: Subject += i[0] if SUB != None: print((Subject, relation_verb.lemma, Object)) def getSPO(self, sentence_list): for sentence in sentence_list: print(sentence) lemmas = self.segment(sentence) # print(lemmas) # 词性标注测试 # print('***' + '词性标注测试' + '***') words = self.postag(lemmas) # for word in words: # print(word.to_string()) # print(words) # 命名实体识别与合并测试 # print('***' + '命名实体识别与合并测试' + '***') words_netag = self.netag(words) # for word in words_netag: # print(word.to_string()) # 依存句法分析测试 # print('***' + '依存句法分析测试' + '***') sentence = self.parse(words_netag) # print(sentence.to_string()) # verb = True # entity = "乾清宫" for item in sentence.words: if (item.head_word == None and item.postag == "v") or ( item.postag == "v" and item.dependency == "COO" and item.head_word.head_word == None): relation_verb = item if item.head_word == None: verbId = item.ID verbId2 = None elif item.head_word.head_word == None: verbId = item.ID verbId2 = item.head_word.ID O_dict = dict() S_dict = dict() OBJ = None SUB = None for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verbId: # if SUB == None or SUB.lemma != entity: SUB = item S_dict[SUB.ID] = SUB.lemma if (item.dependency == "VOB" and item.head_word.ID == verbId) or (item.dependency == "POB" and item.head_word.ID == verbId)\ or (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP" and item.head_word.head_word.ID== verbId): OBJ = item O_dict[OBJ.ID] = OBJ.lemma # if item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP" \ # and item.head_word.head_word.ID == verbId: # verb_p = item.head_word # O_dict[OBJ.lemma] = OBJ.ID if SUB == None: for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verbId2: # if SUB == None or SUB.lemma != entity: SUB = item S_dict[SUB.ID] = SUB.lemma if OBJ == None: for item in sentence.words: if item.dependency == "VOB" and item.head_word.ID == verbId2: OBJ = item O_dict[OBJ.ID] = OBJ.lemma OBJList = [] flag = True while flag == True: len1 = len(S_dict) len2 = len(O_dict) for item in sentence.words: if SUB != None and item.head_word != None: SUBList = S_dict.keys() if item.head_word.ID in SUBList and ( item.dependency == "ATT" or item.dependency == "COO"): SUBATT = item S_dict[SUBATT.ID] = SUBATT.lemma if OBJ != None and item.head_word != None: OBJList = O_dict.keys() if item.head_word.ID in OBJList and ( item.dependency == "ATT" or item.dependency == "COO"): OBJATT = item # if item.dependency!="COO": O_dict[OBJATT.ID] = OBJATT.lemma # else: # O_dict[OBJATT.ID] = OBJATT.lemma+" " if len(S_dict) != len1 or len(O_dict) != len2: flag = True else: flag = False O_dict = sorted(O_dict.items(), key=lambda item: item[0]) S_dict = sorted(S_dict.items(), key=lambda item: item[0]) Object = "" Subject = "" for i in O_dict: Object += i[1] for i in S_dict: Subject += i[1] if SUB != None: print((Subject, relation_verb.lemma, Object)) S_dict2 = dict() O_dict2 = dict() SUB_COO = None OBJ_COO = 
None for item in sentence.words: if item.head_word != None: if SUB != None and item.dependency == "COO" and item.head_word.ID == SUB.ID: # if SUB == None or SUB.lemma != entity: SUB_COO = item S_dict2[SUB_COO.ID] = SUB_COO.lemma if item.head_word != None and OBJ != None: if item.dependency == "COO" and item.head_word.ID == OBJ.ID: OBJ_COO = item O_dict2[OBJ_COO.ID] = OBJ_COO.lemma flag = True while flag == True: len1 = len(S_dict2) len2 = len(O_dict2) for item in sentence.words: if SUB_COO != None and item.head_word != None: SUBList = S_dict2.keys() if item.head_word.ID in SUBList and item.dependency == "ATT": SUBATT = item S_dict2[SUBATT.ID] = SUBATT.lemma if OBJ_COO != None and item.head_word != None: OBJList = O_dict2.keys() if item.head_word.ID in OBJList and item.dependency == "ATT": OBJATT = item O_dict2[OBJATT.ID] = OBJATT.lemma if len(S_dict2) != len1 or len(O_dict2) != len2: flag = True else: flag = False O_dict2 = sorted(O_dict2.items(), key=lambda item: item[0]) S_dict2 = sorted(S_dict2.items(), key=lambda item: item[0]) if len(O_dict2) or len(S_dict2): if len(O_dict2) == 0: O_dict2 = O_dict if len(S_dict2) == 0: S_dict2 = S_dict Object = "" Subject = "" for i in O_dict2: Object += i[1] for i in S_dict2: Subject += i[1] if SUB != None: print((Subject, relation_verb.lemma, Object))
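# --- Usage sketch (added): running the SPO extraction above on a couple of
# sentences. Assumes the entity_verb_new helper, the WordUnit / SentenceUnit /
# EntityCombine classes and the hard-coded model and entity paths from the
# original project are all available; the sentences are illustrative.
if __name__ == "__main__":
    nlp = NLP()
    nlp.getSPO(["乾清宫是明清两代皇帝的寝宫。",
                "故宫博物院位于北京市中心。"])
    nlp.close()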
class HIT_LTP(): def __init__(self, MODELDIR): self.MODELDIR = MODELDIR self.segmentor = Segmentor() self.segmentor.load(os.path.join(MODELDIR, "cws.model")) # postags = 863 词性标注集 # https://ltp.readthedocs.io/zh_CN/latest/appendix.html#id3 self.postagger = Postagger() self.postagger.load(os.path.join(MODELDIR, "pos.model")) self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(MODELDIR, "ner.model")) self.parser = Parser() self.parser.load(os.path.join(MODELDIR, "parser.model")) self.srler = SementicRoleLabeller() self.srler.load(os.path.join(MODELDIR, "pisrl.model")) def ori_segment(self, sentence): words = self.segmentor.segment(sentence) words = list(words) return words def ori_pos(self, words): postags = self.postagger.postag(words) postags = list(postags) return postags def ori_ner(self, words, postags): netags = self.recognizer.recognize(words, postags) netags = list(netags) return netags def ori_parser(self, words, postags): arcs = self.parser.parse(words, postags) arcs = [[arc.head, arc.relation] for arc in arcs] return arcs # 在哈工大 ltp 中,默认为最细粒度分词 def std_seg(self, sentence): words = self.segmentor.segment(sentence) postags = self.postagger.postag(words) terms = [] offe = 0 for word, postag in zip(words, postags): term = {} term['word'] = word term['nature'] = postag term['offset'] = offe offe += len(word) terms.append(term) return terms # 加入 ner 的分词,相当于粗粒度分词 def nlp_seg(self, sentence): words = self.segmentor.segment(sentence) postags = self.postagger.postag(words) netags = self.recognizer.recognize(words, postags) words = list(words) postags = list(postags) netags = list( netags ) # B-Ni E-Ni O O S-Ni O S-Nh O B-Ni E-Ni O S-Nh O O S-Nh O O O O S-Ns O S-Ns O chunks = self.get_ner_info( netags ) # [('Ni', 0, 2), ('Ni', 4, 5), ('Nh', 6, 7), ('Ni', 8, 10), ('Nh', 11, 12), ('Nh', 14, 15), ('Ns', 19, 20), ('Ns', 21, 22)] num_ners = len(chunks) # 得到加入 ner 的 words_ 与 postags_ words_ = [] postags_ = [] if num_ners != 0: ner_index = 0 length = 0 for i in range(len(words)): j = i + length if j < len(words): ner_type = chunks[ner_index][0] ner_start = chunks[ner_index][1] ner_end = chunks[ner_index][2] word = words[j] postag = postags[j] if j == ner_start: for k in range(ner_start + 1, ner_end): word += words[k] length += 1 postag = ner_type.lower() if ner_index < len(chunks) - 1: ner_index += 1 words_.append(word) postags_.append(postag) terms = [] offe = 0 for word, postag in zip(words_, postags_): term = {} term['word'] = word term['nature'] = postag term['offset'] = offe offe += len(word) terms.append(term) return terms def std_analysis(self, sentence): data = {} words = self.segmentor.segment(sentence) postags = self.postagger.postag(words) words = list(words) postags = list(postags) arcs = self.parser.parse(words, postags) arcs_ = [[arc.head, arc.relation] for arc in arcs] child_dict_list = self.build_parse_child_dict(words, postags, arcs) data['words'] = words data['postags'] = postags data['arcs'] = arcs_ data['child_dict_list'] = child_dict_list return data def nlp_analysis(self, sentence): data = {} words = self.segmentor.segment(sentence) postags = self.postagger.postag(words) netags = self.recognizer.recognize(words, postags) words = list(words) postags = list(postags) netags = list( netags ) # B-Ni E-Ni O O S-Ni O S-Nh O B-Ni E-Ni O S-Nh O O S-Nh O O O O S-Ns O S-Ns O chunks = self.get_ner_info( netags ) # [('Ni', 0, 2), ('Ni', 4, 5), ('Nh', 6, 7), ('Ni', 8, 10), ('Nh', 11, 12), ('Nh', 14, 15), ('Ns', 19, 20), ('Ns', 21, 22)] num_ners = len(chunks) # 得到加入 
ner 的 words_ 与 postags_ words_ = [] postags_ = [] if num_ners != 0: ner_index = 0 length = 0 for i in range(len(words)): j = i + length if j < len(words): ner_type = chunks[ner_index][0] ner_start = chunks[ner_index][1] ner_end = chunks[ner_index][2] word = words[j] postag = postags[j] if j == ner_start: for k in range(ner_start + 1, ner_end): word += words[k] length += 1 postag = ner_type.lower() if ner_index < len(chunks) - 1: ner_index += 1 words_.append(word) postags_.append(postag) arcs = self.parser.parse(words_, postags_) arcs_ = [[arc.head, arc.relation] for arc in arcs] child_dict_list = self.build_parse_child_dict(words_, postags_, arcs) data['words'] = words_ data['postags'] = postags_ data['arcs'] = arcs_ data['child_dict_list'] = child_dict_list return data # 基于“细粒度词”的 ner def ner(self, sentence): words = self.segmentor.segment(sentence) postags = self.postagger.postag(words) netags = self.recognizer.recognize(words, postags) words = list(words) postags = list(postags) netags = list( netags ) # B-Ni E-Ni O O S-Ni O S-Nh O B-Ni E-Ni O S-Nh O O S-Nh O O O O S-Ns O S-Ns O chunks = self.get_ner_info( netags ) # [('Ni', 0, 2), ('Ni', 4, 5), ('Nh', 6, 7), ('Ni', 8, 10), ('Nh', 11, 12), ('Nh', 14, 15), ('Ns', 19, 20), ('Ns', 21, 22)] ner_info = [] for chunk in chunks: ner_type = chunk[0] ner_start = chunk[1] ner_end = chunk[2] ner_name = ''.join(words[ner_start:ner_end]) ner_offe = 0 for i in range(len(words)): if i == ner_start: break ner_offe += len(words[i]) ner_info.append({ 'ner_name': ner_name, 'ner_type': ner_type, 'ner_offe': ner_offe }) return ner_info def parser(self, sentence): words = self.segmentor.segment(sentence) postags = self.postagger.postag(words) arcs = self.parser.parse(words, postags) arcs = [[arc.head, arc.relation] for arc in arcs] return arcs # 可能会有多个 def get_core_words(self, sentence, words=None, postags=None): core_words_info = [] core_words_indexs = [] if words is None: words = self.segmentor.segment(sentence) words = list(words) if postags is None: postags = self.postagger.postag(words) postags = list(postags) arcs = self.parser.parse(words, postags) arcs_ = [[arc.head, arc.relation] for arc in arcs] child_dict_list = self.build_parse_child_dict(words, postags, arcs) for i in range(len(arcs_)): if arcs_[i][1] == 'HED': core_words_indexs.append(i) self.complete_core_words(core_words_indexs, i, child_dict_list) for i in core_words_indexs: word = words[i] offe = len(''.join(words[0:i])) temp_dic = {} temp_dic['word'] = word temp_dic['offe'] = offe core_words_info.append(temp_dic) return core_words_info # 为了更灵活,words = None, postags = None 可解耦 def get_srl_triple(self, sentence, words=None, postags=None): data = {} if words is None: words = self.segmentor.segment(sentence) words = list(words) if postags is None: postags = self.postagger.postag(words) postags = list(postags) netags = self.recognizer.recognize(words, postags) netags = list(netags) arcs = self.parser.parse(words, postags) arcs_ = [[arc.head, arc.relation] for arc in arcs] roles = self.srler.label(words, postags, arcs) # 可能有多组角色 triple_info = [] for role in roles: tem_dic = {} triple = ['', '', ''] TMP = '' LOC = '' role = role.index, "".join([ "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments ]) predicate = words[role[0]] triple[1] = predicate args = role[1].split(")") args.remove('') for ele in args: ele = ele.split(":") if ele[0] == "A0": index = ele[1][1:].split(",") A0 = words[int(index[0]):int(index[1]) + 1] A0_str = "".join(A0) triple[0] = A0_str if ele[0] 
== "A1": index = ele[1][1:].split(",") A1 = words[int(index[0]):int(index[1]) + 1] A1_str = "".join(A1) triple[2] = A1_str if ele[0] == "TMP": index = ele[1][1:].split(",") tmp = words[int(index[0]):int(index[1]) + 1] tmp_str = "".join(tmp) TMP = tmp_str if ele[0] == "LOC": index = ele[1][1:].split(",") loc = words[int(index[0]):int(index[1]) + 1] loc_str = "".join(loc) LOC = loc_str tem_dic['role'] = role tem_dic['predicate'] = predicate tem_dic['triple'] = triple tem_dic['TMP'] = TMP tem_dic['LOC'] = LOC triple_info.append(tem_dic) chunks = self.get_ner_info( netags ) # [('Ni', 0, 2), ('Ni', 4, 5), ('Nh', 6, 7), ('Ni', 8, 10), ('Nh', 11, 12), ('Nh', 14, 15), ('Ns', 19, 20), ('Ns', 21, 22)] ner_info = [] for chunk in chunks: ner_type = chunk[0] ner_start = chunk[1] ner_end = chunk[2] ner_name = ''.join(words[ner_start:ner_end]) ner_offe = 0 for i in range(len(words)): if i == ner_start: break ner_offe += len(words[i]) ner_info.append({ 'ner_name': ner_name, 'ner_type': ner_type, 'ner_offe': ner_offe }) data['words'] = words data['postags'] = postags data['arcs'] = arcs_ data['triple_info'] = triple_info data['ner_info'] = ner_info return data def get_parser_triple(self, sentence, words=None, postags=None): data = {} if words is None: words = self.segmentor.segment(sentence) words = list(words) if postags is None: postags = self.postagger.postag(words) postags = list(postags) netags = self.recognizer.recognize(words, postags) netags = list(netags) arcs = self.parser.parse(words, postags) arcs_ = [[arc.head, arc.relation] for arc in arcs] child_dict_list = self.build_parse_child_dict(words, postags, arcs) triple_info = [] for index in range(len(postags)): # 抽取以谓词为中心的事实三元组 if postags[index] == 'v': child_dict = child_dict_list[index] # 主谓宾 if 'SBV' in child_dict and 'VOB' in child_dict: e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) r = words[index] e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0]) temp_dic = {} temp_dic['triple'] = [e1, r, e2] temp_dic['type'] = '主谓宾' triple_info.append(temp_dic) # 定语后置,动宾关系 # 进行v 正式 访问vob 的 缅甸国务资政昂山素季sbv # 动宾,补主语 elif arcs[index].relation == 'ATT': if 'VOB' in child_dict: e1 = self.complete_e(words, postags, child_dict_list, arcs[index].head - 1) r = words[index] e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0]) temp_string = r + e2 if temp_string == e1[:len(temp_string)]: e1 = e1[len(temp_string):] if temp_string not in e1: temp_dic = {} temp_dic['triple'] = [e1, r, e2] temp_dic['type'] = '补主' triple_info.append(temp_dic) # 含有介宾关系的主谓动补关系 # 哈立德sbv 居住 在cmp(动补结构) 土耳其pob # 主谓,补宾语 elif 'SBV' in child_dict and 'CMP' in child_dict: #e1 = words[child_dict['SBV'][0]] e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) cmp_index = child_dict['CMP'][0] r = words[index] + words[cmp_index] if 'POB' in child_dict_list[cmp_index]: e2 = self.complete_e( words, postags, child_dict_list, child_dict_list[cmp_index]['POB'][0]) temp_dic = {} temp_dic['triple'] = [e1, r, e2] temp_dic['type'] = '补宾' triple_info.append(temp_dic) # 主谓 elif 'SBV' in child_dict: e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) r = words[index] temp_dic = {} temp_dic['triple'] = [e1, r, ''] temp_dic['type'] = '主谓' triple_info.append(temp_dic) # 谓宾 elif 'VOB' in child_dict: r = words[index] e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0]) temp_dic = {} temp_dic['triple'] = ['', r, e2] temp_dic['type'] = '谓宾' triple_info.append(temp_dic) # FOB 
宾语前置 '中泰数字经济合作部级对话机制第一次会议在云南昆明召开' elif 'FOB' in child_dict: r = words[index] e2 = self.complete_e(words, postags, child_dict_list, child_dict['FOB'][0]) temp_dic = {} temp_dic['triple'] = ['', r, e2] temp_dic['type'] = '宾前' triple_info.append(temp_dic) chunks = self.get_ner_info( netags ) # [('Ni', 0, 2), ('Ni', 4, 5), ('Nh', 6, 7), ('Ni', 8, 10), ('Nh', 11, 12), ('Nh', 14, 15), ('Ns', 19, 20), ('Ns', 21, 22)] ner_info = [] for chunk in chunks: ner_type = chunk[0] ner_start = chunk[1] ner_end = chunk[2] ner_name = ''.join(words[ner_start:ner_end]) ner_offe = 0 for i in range(len(words)): if i == ner_start: break ner_offe += len(words[i]) ner_info.append({ 'ner_name': ner_name, 'ner_type': ner_type, 'ner_offe': ner_offe }) core_words_info = self.get_core_words(sentence, words=words, postags=postags) triple_info, core_words_info_ = self.triple_info_futher_merge( core_words_info, triple_info) data['words'] = words data['postags'] = postags data['arcs'] = arcs_ data['core_words_info'] = core_words_info_ data['triple_info'] = triple_info data['ner_info'] = ner_info return data # 当三元组的谓词在 core_words_info 中时(谓词并列关系 COO): # 如果谓词连续,且一个 triple 缺主语,一个缺宾语,则合并两谓词与主与宾语 # 如果谓词不连续,且 triple 存在主语缺失或是宾语缺失的情况,则分别补全主语和宾语 def triple_info_futher_merge(self, core_words_info, triple_info): core_words_info_ = [] for i in range(len(core_words_info) - 1): # 合并后需返回合并后的 core_words_info if core_words_info[i]['offe'] + len(core_words_info[ i + 1]['word']) == core_words_info[i + 1]['offe']: # 相邻 triple_ = ['', '', ''] condition = 0 #print('len(triple_info)',len(triple_info)) for j in range(len(triple_info)): #print('j', j) #print('triple_info', triple_info) if triple_info[j]['triple'][1] == core_words_info[i][ 'word']: if j + 1 < len(triple_info): if triple_info[j + 1]['triple'][1] == core_words_info[ i + 1]['word']: triple_[0] = triple_info[j]['triple'][0] triple_[2] = triple_info[j + 1]['triple'][2] triple_[1] = triple_info[j]['triple'][ 1] + triple_info[j + 1]['triple'][1] triple_dic = {} triple_dic['triple'] = triple_ triple_dic['type'] = '主谓宾' triple_info[j] = triple_dic condition = 1 break if condition == 1: core_words_info_.append({ 'word': triple_[1], 'offe': core_words_info[i]['offe'] }) else: core_words_info_.append(core_words_info[i]) else: # 不相邻 sub = '' obj = '' for triple in triple_info: if triple['triple'][0] != '': sub = triple['triple'][0] if triple['triple'][2] != '': obj = triple['triple'][2] for triple in triple_info: if triple['triple'][0] == '': triple['triple'][0] = sub if triple['triple'][2] == '': triple['triple'][2] = obj core_words_info_.append(core_words_info[i]) # 补最后一个 for i in range(len(core_words_info)): if i == len(core_words_info) - 1: core_words_info_.append(core_words_info[i]) # print(core_words_info) # print(triple_info) return triple_info, core_words_info_ def restart(self): self.segmentor.release() self.postagger.release() self.recognizer.release() self.parser.release() self.srler.release() self.segmentor = Segmentor() self.segmentor.load(os.path.join(self.MODELDIR, "cws.model")) self.postagger = Postagger() self.postagger.load(os.path.join(self.MODELDIR, "pos.model")) self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(self.MODELDIR, "ner.model")) self.parser = Parser() self.parser.load(os.path.join(self.MODELDIR, "parser.model")) self.srler = SementicRoleLabeller() self.srler.load(os.path.join(self.MODELDIR, "pisrl.model")) def release(self): self.segmentor.release() self.postagger.release() self.recognizer.release() self.parser.release() 
self.srler.release() # =================== 以下为相关工具方法 =================== =================== =================== =================== def get_ner_type(self, tag_name): tag_class = tag_name.split('-')[0] # B I S E O tag_type = tag_name.split('-')[-1] # Ni Ns Nh return tag_class, tag_type def get_ner_info(self, netags): default = "O" chunks = [] # 定义 类别 和 起始索引 chunk_type, chunk_start = None, None for i, tok in enumerate(netags): # End of a chunk 1 if tok == default and chunk_type is not None: # Add a chunk. chunk = (chunk_type, chunk_start, i) chunks.append(chunk) chunk_type, chunk_start = None, None elif tok != default: tok_chunk_class, tok_chunk_type = self.get_ner_type(tok) # 处理 tok_chunk_class if tok_chunk_class != 'e' and tok_chunk_class != 'm': # 第一次... # start of a chunk if chunk_type is None: chunk_type, chunk_start = tok_chunk_type, i # End of a chunk + start of a chunk! elif tok_chunk_type != chunk_type or tok_chunk_class == "b" or tok_chunk_class == "s": chunk = (chunk_type, chunk_start, i) chunks.append(chunk) chunk_type, chunk_start = tok_chunk_type, i else: pass # end condition if chunk_type is not None: chunk = (chunk_type, chunk_start, len(netags)) chunks.append(chunk) return chunks def build_parse_child_dict(self, words, postags, arcs): """ 为句子中的每个词语维护一个保存句法依存儿子节点的字典 Args: words: 分词列表 postags: 词性列表 arcs: 句法依存列表 """ child_dict_list = [] for index in range(len(words)): child_dict = dict() for arc_index in range(len(arcs)): if arcs[arc_index].head == index + 1: if arcs[arc_index].relation in child_dict: child_dict[arcs[arc_index].relation].append(arc_index) else: child_dict[arcs[arc_index].relation] = [] child_dict[arcs[arc_index].relation].append(arc_index) child_dict_list.append(child_dict) return child_dict_list ''' # 1、ATT定中关系,2、动宾短语实体,3、从父节点向子节点遍历 def complete_e(self, words, postags, child_dict_list, word_index): """ 完善识别的部分实体 """ child_dict = child_dict_list[word_index] prefix = '' if 'ATT' in child_dict: for i in range(len(child_dict['ATT'])): prefix += self.complete_e(words, postags, child_dict_list, child_dict['ATT'][i]) postfix = '' if postags[word_index] == 'v': if 'VOB' in child_dict: postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0]) if 'SBV' in child_dict: prefix = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix return prefix + words[word_index] + postfix ''' ''' def complete_e(self, words, postags, child_dict_list, word_index): """ 完善识别的部分实体 """ child_dict = child_dict_list[word_index] prefix = '' postfix = '' if 'ATT' in child_dict: for i in range(len(child_dict['ATT'])): prefix += self.complete_e(words, postags, child_dict_list, child_dict['ATT'][i]) if 'COO' in child_dict: for i in range(len(child_dict['COO'])): postfix += self.complete_e(words, postags, child_dict_list, child_dict['COO'][i]) if postags[word_index] == 'v': if 'VOB' in child_dict: postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0]) if 'SBV' in child_dict: prefix = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix return prefix + words[word_index] + postfix ''' def complete_e(self, words, postags, child_dict_list, word_index): """ 完善识别的部分实体 """ child_dict = child_dict_list[word_index] prefix = '' postfix = '' if 'ATT' in child_dict: for i in range(len(child_dict['ATT'])): prefix += self.complete_e(words, postags, child_dict_list, child_dict['ATT'][i]) if 'COO' in child_dict: for i in range(len(child_dict['COO'])): if child_dict['COO'][ i] - word_index == 1: 
#如果并列的主语和宾语在原文中有分割,则用‘、’分割 # if postags[child_dict['COO'][i]]=='j':#考虑词性,可能不够全面 postfix += self.complete_e(words, postags, child_dict_list, child_dict['COO'][i]) else: postfix += '、' + self.complete_e( words, postags, child_dict_list, child_dict['COO'][i]) if postags[word_index] == 'v': if 'VOB' in child_dict: postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0]) if 'SBV' in child_dict: prefix = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix return prefix + words[word_index] + postfix # 完善 HED 的 COO 关系 def complete_core_words(self, core_words_indexs, hed_index, child_dict_list): if 'COO' in child_dict_list[hed_index].keys(): core_words_indexs += child_dict_list[hed_index]['COO'] for i in child_dict_list[hed_index]['COO']: self.complete_core_words(core_words_indexs, i, child_dict_list)
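# --- Usage sketch (added): a minimal run of the HIT_LTP wrapper above. The
# model directory path and the sentence are illustrative.
if __name__ == "__main__":
    ltp = HIT_LTP("./ltp_data_v3.4.0")
    sent = "国家主席习近平抵达美国佛罗里达州"
    print(ltp.std_seg(sent))   # fine-grained tokens with POS and character offsets
    print(ltp.ner(sent))       # NER chunks with name / type / offset
    data = ltp.get_parser_triple(sent)
    for t in data["triple_info"]:
        print(t["type"], t["triple"])
    ltp.release()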
if __name__ == "__main__": args = parse_args() LTP_DATA_DIR = '../data/ltp_data' pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model') # 词性标注模型路径,模型名称为`pos.model` cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model') #分词 par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model') #语法分析 segmentor = Segmentor() segmentor.load(cws_model_path) postagger = Postagger() # 初始化实例 postagger.load(pos_model_path) # 加载模型 parser = Parser() parser.load(par_model_path) entity_name = "母婴" #sentence =[ "专门打电话来问我要不要买手机","最近想买部手机","我想入手一部索尼的手机,主要用于日常拍摄和毕业旅行"] #mode_list=['val','test','train'] trigger_words = [ '小孩', '孩子', '宝宝', '妈妈', '月子', '奶瓶', '幼儿', '新生儿', '儿童', '儿子', '女儿', '婴儿', '胎儿', '爸爸', '母爱', '麻麻', '避孕', '妊娠', '孕期', '孕妇', '母乳', '疫苗', '辣妈', '妈', '当妈', '怀孕', '宝妈', '母婴', '孕妈', '奶爸', '宝贝', '辅食', '奶粉', '男孩', '女孩', '男宝', '女宝', '女宝', '湿疹', '父母', '母亲', '父亲', '元素', '微量元素', '臭臭', '哺乳', '米粉', '父教', '产妇', '堕胎', '纸尿裤', '尿裤', '娃', '小儿', '尿不湿', '回奶', '断奶', '早教', '胎教', '吐奶', '待产', '宝', '童车', '孕前', '孕', '奶嘴', '早产', '冲奶', '育儿', '月嫂', '叶酸', '二胎', '吸乳', '乳汁', '产前', '产后', '奶水', '亲子装' ]
+---------------+-----+----------+------------------------------------+
| Core relation | HED | head     | the core of the whole sentence     |
+---------------+-----+----------+------------------------------------+
"""
from __future__ import unicode_literals

import logging

from pyltp import Parser

from . import ltp_model_loader
from .ltp_cloud import dp_online

__all__ = ['dependency_parsing']

# load the model
parser = Parser()
ltp_model_loader.load(parser)


def dependency_parsing(words, postags, online=False):
    """
    Dependency parsing.
    :param words: segmentation result
    :param postags: POS tagging result
    :return: dependency parse tree
    """
    # online=True: use ltp-cloud for the parse
    if online:
        return dp_online(words, postags)
    # otherwise parse with the local ltp model
    # (the source was cut off after the first argument; encoding the postags the
    # same way and returning the arcs is the assumed completion)
    arcs = parser.parse([i.encode('utf-8') for i in words],
                        [i.encode('utf-8') for i in postags])
    return arcs
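# --- Usage sketch (added): how a caller elsewhere in the package might use
# dependency_parsing (this module uses relative imports, so it is not meant to
# be run directly). The tokens and POS tags match the example used in the other
# snippets of this collection.
def _demo_dependency_parsing():
    words = ['元芳', '你', '怎么', '看']
    postags = ['nh', 'r', 'r', 'v']
    arcs = dependency_parsing(words, postags)
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))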
class LtpAnalysis(object):
    def __init__(self):
        self.postagger = Postagger()
        self.parser = Parser()
        self.parser.load(par_model_path)
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model_path)
        self.labeller = SementicRoleLabeller()
        self.labeller.load(srl_model_path)
        self.postagger.load_with_lexicon(pos_model_path,
                                         '/home/wangwei/conf/posttags.txt')

    def LtpRecon(self, sents):
        """
        Segmentation, POS tagging, dependency parsing, NER and semantic role labelling.
        :param sents:
        :return:
        """
        # segmentation (norm_cut comes from the surrounding project; the original
        # encoded each token to UTF-8 bytes for Python 2)
        words = [i for i in norm_cut(sents)]
        logger.info('\t'.join(words))
        # POS tagging
        postags = self.postagger.postag(words)
        logger.info('\t'.join(postags))
        # dependency parsing
        arcs = self.parser.parse(words, postags)
        logger.info("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
        # named entity recognition
        netags = self.recognizer.recognize(words, postags)
        logger.info('\t'.join(netags))
        # semantic role labelling
        roles = self.labeller.label(words, postags, arcs)
        for role in roles:
            print(role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
        self.words, self.postags, self.arcs, self.netags, self.roles = \
            words, postags, arcs, netags, roles

    def deal_arc(self):
        # maps "word_index#head_index" -> arc.relation
        drelation = {}
        num = -1
        for arc in self.arcs:
            num += 1
            k = str(num) + '#' + str(arc.head - 1)
            drelation[k] = arc.relation
        # the original returned self.drelation without ever setting it
        self.drelation = drelation
        return self.drelation

    def vob(self, index):
        num = -1
        for arc in self.arcs:
            num += 1
            if arc.relation in ['VOB'] and (arc.head - 1) == index:
                return self.words[num]

    def att(self, att):
        num = -1

    def post(self, target):
        """
        Expand the opinion target by following ATT arcs.
        :param target:
        :return:
        """
        obj = set()
        obj.add(target)
        num = 0
        for arc in self.arcs:
            if (arc.head - 1) == target and arc.relation == 'ATT':
                obj.add(arc.head - 1)
                obj |= self.post(num)
            num += 1
        return obj

    def analysis(self, sents):
        self.LtpRecon(sents)
        # self.deal_arc()
        num = -1
        for arc in self.arcs:
            num += 1
            if arc.relation == 'SBV':
                vob_word = self.vob(arc.head - 1)
                att = self.post(num)
                attword = ''.join([self.words[i] for i in att if i != num])
                print(attword, self.words[num], self.words[arc.head - 1], vob_word)
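# --- Usage sketch (added): assumes the module-level model paths (par_model_path,
# ner_model_path, srl_model_path, pos_model_path), the lexicon file, logger and
# norm_cut() are defined as in the original project; the review sentence is
# illustrative.
if __name__ == "__main__":
    analyser = LtpAnalysis()
    analyser.analysis("手机外型很漂亮,屏幕也不错")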
LTP_DATA_DIR = './ltp_data_v3.4.0'                           # path to the ltp model directory
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parsing model, `parser.model`
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')     # word segmentation
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')     # POS tagging model, `pos.model`
srl_model_path = os.path.join(LTP_DATA_DIR, 'pisrl.model')   # semantic role labelling model

segmentor = Segmentor()            # initialise the instance
segmentor.load(cws_model_path)     # load the model
postagger = Postagger()            # initialise the instance
postagger.load(pos_model_path)     # load the model
parser = Parser()                  # initialise the instance
parser.load(par_model_path)        # load the model
labeller = SementicRoleLabeller()  # initialise the instance
labeller.load(srl_model_path)      # load the model


def cht_to_chs(line):
    # convert traditional Chinese to simplified Chinese
    line = Converter('zh-hans').convert(line)
    line.encode('utf-8')
    return line


def load_data():
    # load data from the manually annotated set
    words = []
    labA = []
class SentenceParser: """ A class for sentence analysis """ def __init__(self): """ Load remote lexicon and ltp model """ self.temp_lexicon = "temp_lexicon" self.fetch_lexicon() self.sentence_splitter = SentenceSplitter() self.segment = Segmentor() self.segment.load_with_lexicon(CWS_MODEL, self.temp_lexicon) self.pos = Postagger() self.pos.load_with_lexicon(POS_MODEL, self.temp_lexicon) self.tree_parser = Parser() self.tree_parser.load(PARSER_MODEL) def fetch_lexicon(self): """ Load lexicon and write to local """ res = db.fetch_lexicon() with open(self.temp_lexicon, "w", encoding="utf8") as f: for item in res: token, synonym, norm_token, pos = item pos = pos.replace(",", " ") token = "%s %s" % (token, pos) norm_token = "%s %s" % (norm_token, pos) if synonym: synonym = "\n".join( list( map(lambda x: "%s %s" % (x, pos), synonym.split(",")))) f.write("%s\n%s\n%s\n" % (token, synonym, norm_token)) else: f.write("%s\n%s\n" % (token, norm_token)) def seg_sentence(self, text): """ Segment sentence by punctuation :param text: raw string :return: vector of sentences, use list() to covert as [sentence0, sentence1, ...] """ return self.sentence_splitter.split(text) def seg_token(self, text): """ Segment token by model and lexicon :param text: raw string :return: vector of tokens use list() to convert as [token0, token1, ...] """ return self.segment.segment(text) def pos_tag(self, text): """ Tag position of speech for text by model and lexicon :param text: raw string :return: vector of pos, use list() to convert as [pos0, pos1, ...] """ tokens = self.seg_token(text) return self.pos.postag(tokens) def parse_list(self, text): """ Parse the sentence as a list of word node :param text: raw string :return: a list of word node """ result = [] words = self.seg_token(text) pos_list = self.pos.postag(words) if len(words) == 0 or len(pos_list) == 0: return result arcs = self.tree_parser.parse(words, pos_list) nodes = list(map(lambda x: (x.head, x.relation), arcs)) for token, pos, relation in zip(words, pos_list, nodes): word_node = WordNode(token, pos, relation[1]) result.append(word_node) return result def parse_tree(self, text): """ Parse the sentence as a dependence tree of word node :param text: raw string :return: a dependence tree of word node """ words = self.seg_token(text) pos = self.pos.postag(words) if len(words) == 0 or len(pos) == 0: return WordNode("", "", "", None) arcs = self.tree_parser.parse(words, pos) nodes = list(map(lambda x: (x.head, x.relation), arcs)) root_idx = find_x(nodes, 0) root = WordNode(words[root_idx[0]], pos[root_idx[0]], nodes[root_idx[0]][1]) tree = {root_idx[0]: root} queue = root_idx while len(queue): next_idx = queue.pop() for idx in find_x(nodes, next_idx + 1): queue.insert(0, idx) new_node = WordNode(words[idx], pos[idx], nodes[idx][1]) tree[next_idx].next.append(new_node) tree[idx] = new_node return root
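# --- Usage sketch (added): listing per-token analyses with the SentenceParser
# above. Assumes CWS_MODEL / POS_MODEL / PARSER_MODEL, the db module and the
# WordNode / find_x helpers from the original project are importable; the
# WordNode attribute names below are assumed from its constructor arguments.
if __name__ == "__main__":
    sp = SentenceParser()
    for node in sp.parse_list("元芳你怎么看"):
        print(node.token, node.pos, node.relation)  # attribute names assumed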
class SentenceParser: """ A class for sentence analysis """ def __init__(self): """ Load remote lexicon and ltp model """ self.temp_lexicon = "temp_lexicon" self.fetch_lexicon() self.sentence_splitter = SentenceSplitter() self.segment = Segmentor() self.segment.load_with_lexicon(CWS_MODEL, self.temp_lexicon) self.pos = Postagger() self.pos.load_with_lexicon(POS_MODEL, self.temp_lexicon) self.tree_parser = Parser() self.tree_parser.load(PARSER_MODEL) self.rules = IterDocument("data/rule") def fetch_lexicon(self): """ Load lexicon and write to local """ res = db.fetch_lexicon() with open(self.temp_lexicon, "w", encoding="utf8") as f: for item in res: token, synonym, norm_token, pos = item pos = pos.replace(",", " ") token = "%s %s" % (token, pos) norm_token = "%s %s" % (norm_token, pos) if synonym: synonym = "\n".join( list( map(lambda x: "%s %s" % (x, pos), synonym.split(",")))) f.write("%s\n%s\n%s\n" % (token, synonym, norm_token)) else: f.write("%s\n%s\n" % (token, norm_token)) def seg_sentence(self, text): """ Segment sentence by punctuation :param text: raw string :return: vector of sentences, use list() to covert as [sentence0, sentence1, ...] """ return self.sentence_splitter.split(text) def seg_token(self, text): """ Segment token by model and lexicon :param text: raw string :return: vector of tokens use list() to convert as [token0, token1, ...] """ return self.segment.segment(text) def pos_tag(self, text): """ Tag position of speech for text by model and lexicon :param text: raw string :return: vector of pos, use list() to convert as [pos0, pos1, ...] """ tokens = self.seg_token(text) return self.pos.postag(tokens) def parse_list(self, text, need_info=False): """ Parse the sentence as a list of word node :param need_info: whether need extra info :param text: raw string :return: a list of word node """ result = [] words = self.seg_token(text) pos_list = self.pos.postag(words) if len(words) == 0 or len(pos_list) == 0: return result arcs = self.tree_parser.parse(words, pos_list) nodes = list(map(lambda x: (x.head, x.relation), arcs)) for token, pos, relation in zip(words, pos_list, nodes): if need_info: info = db.get_word(token) if info: category, norm_token, extra = info word_node = WordNode(token, pos, relation[1], category, norm_token, extra) else: word_node = WordNode(token, pos, relation[1]) else: word_node = WordNode(token, pos, relation[1]) result.append(word_node) return result def parse_tree(self, text, need_info=False): """ Parse the sentence as a dependence tree of word node :param need_info: whether need extra info :param text: raw string :return: a dependence tree of word node """ words = self.seg_token(text) pos = self.pos.postag(words) if len(words) == 0 or len(pos) == 0: return WordNode("", "", "", None) arcs = self.tree_parser.parse(words, pos) nodes = list(map(lambda x: (x.head, x.relation), arcs)) root_idx = find_x(nodes, 0) if need_info: info = db.get_word(words[root_idx[0]]) if info: category, norm_token, extra = info root = WordNode(words[root_idx[0]], pos[root_idx[0]], nodes[root_idx[0]][1], category, norm_token, extra) else: root = WordNode(words[root_idx[0]], pos[root_idx[0]], nodes[root_idx[0]][1]) else: root = WordNode(words[root_idx[0]], pos[root_idx[0]], nodes[root_idx[0]][1]) tree = {root_idx[0]: root} queue = root_idx while len(queue): next_idx = queue.pop() for idx in find_x(nodes, next_idx + 1): queue.insert(0, idx) if need_info: info = db.get_word(words[root_idx[0]]) if info: category, norm_token, extra = info new_node = WordNode(words[idx], pos[idx], 
nodes[idx][1], category, norm_token, extra) else: new_node = WordNode(words[idx], pos[idx], nodes[idx][1]) else: new_node = WordNode(words[idx], pos[idx], nodes[idx][1]) tree[next_idx].next.append(new_node) tree[idx] = new_node return root def extract(self, path): res = [] for rule in self.rules: window_size = len(rule.split(";")) if len(path) == window_size: if ";".join(map(lambda x: "%s,%s" % (x.relation, x.pos), path)) == rule: res.append(" ".join(map(lambda x: x.token, path))) else: for i in range(len(path) - window_size): p_slice = ";".join( map(lambda x: "%s,%s" % (x.relation, x.pos), path[i:i + window_size])) if p_slice == rule: res.append(" ".join( map(lambda x: x.token, path[i:i + window_size]))) break return res
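The extract() method above compares a sliding window over the parsed path against rules read from data/rule; the rule format is inferred from its join logic (each line is a ';'-joined sequence of 'relation,pos' pairs). A self-contained illustration with stand-in names:

# Illustration of the matching logic in extract(); Node is a stand-in for WordNode.
from collections import namedtuple

Node = namedtuple('Node', 'token pos relation')
path = [Node('李克强', 'nh', 'SBV'), Node('视察', 'v', 'HED'), Node('厦门', 'ns', 'VOB')]
rule = 'SBV,nh;HED,v;VOB,ns'   # hypothetical rule line

window = ';'.join('%s,%s' % (n.relation, n.pos) for n in path)
if window == rule:
    print(' '.join(n.token for n in path))   # -> 李克强 视察 厦门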
######### LTP setup ##########
import os
import re
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer

LTP_DIR = ".../ltp_data"

# word segmentation
# B = word begin, I = word middle, E = word end, S = single-character word
segmentor = Segmentor()
segmentor.load(os.path.join(LTP_DIR, "cws.model"))

# POS tagging
postagger = Postagger()
postagger.load(os.path.join(LTP_DIR, "pos.model"))

# dependency parsing
parser = Parser()
parser.load(os.path.join(LTP_DIR, "parser.model"))

# named entity recognition
# O = not part of an NE, S = single-word NE, B = NE begin, I = NE middle, E = NE end
# Nh = person, Ni = organization, Ns = place
recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(LTP_DIR, "ner.model"))

######### Long-sentence splitting ##########
# Input: raw text; output: list of short sentences
# Strip noise characters from a long sentence, then split it on the specified punctuation
def seg_long_sents(content):
    # remove spaces ' ', full-width spaces '\u3000' and the Chinese dash '——'
    # then split on '?' '!' '?' '!' '。', newlines and carriage returns
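The body of seg_long_sents is cut off above. A minimal sketch that follows the comments (strip the noise characters, then split on the listed sentence-final punctuation and line breaks) could look like this; the regular expression is an assumption, not the original implementation.

def seg_long_sents_sketch(content):
    # remove ' ', full-width space '\u3000' and the Chinese dash '——'
    content = content.replace(' ', '').replace('\u3000', '').replace('——', '')
    # split on ? ! ? ! 。 and line breaks, dropping empty pieces
    return [s for s in re.split(r'[?!?!。\r\n]', content) if s]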
class NLP: """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析 Attributes: default_user_dict_dir: str,用户自定义词典目录 default_model_dir: str,ltp模型文件目录 """ default_user_dict_dir = '../../resource/' # 默认的用户词典目录,清华大学法律词典 default_model_dir = '../../model/' # ltp模型文件目录 def __init__(self, user_dict_dir=default_user_dict_dir, model_dir=default_model_dir): self.default_user_dict_dir = user_dict_dir self.default_model_dir = model_dir # 初始化分词器 # pynlpir.open() # 初始化分词器 # 添加用户词典(法律文书大辞典与清华大学法律词典),这种方式是添加进内存中,速度更快 files = os.listdir(user_dict_dir) for file in files: file_path = os.path.join(user_dict_dir, file) # 文件夹则跳过 if os.path.isdir(file): continue with open(file_path, 'r', encoding='utf-8') as f: line = f.readline() while line: word = line.strip('\n').strip() jieba.add_word(word) # print(c_char_p(word.encode())) # pynlpir.nlpir.AddUserWord(c_char_p(word.encode())) line = f.readline() # 加载ltp模型 # 词性标注模型 self.postagger = Postagger() postag_flag = self.postagger.load( os.path.join(self.default_model_dir, 'pos.model')) # 命名实体识别模型 self.recognizer = NamedEntityRecognizer() ner_flag = self.recognizer.load( os.path.join(self.default_model_dir, 'ner.model')) # 依存句法分析模型 self.parser = Parser() parse_flag = self.parser.load( os.path.join(self.default_model_dir, 'parser.model')) if postag_flag or ner_flag or parse_flag: print('load model failed!') def segment(self, sentence, entity_postag=dict()): """采用NLPIR进行分词处理 Args: sentence: string,句子 entity_postag: dict,实体词性词典,默认为空集合,分析每一个案例的结构化文本时产生 Returns: lemmas: list,分词结果 """ # 添加实体词典 if entity_postag: for entity in entity_postag: # pynlpir.nlpir.AddUserWord(c_char_p(entity.encode())) jieba.add_word(entity) # pynlpir.nlpir.AddUserWord(c_char_p('前任'.encode())) # 单个用户词加入示例 # pynlpir.nlpir.AddUserWord(c_char_p('习近平'.encode())) # 单个用户词加入示例 # 分词,不进行词性标注 # lemmas = pynlpir.segment(sentence, pos_tagging=False) lemmas = jieba.lcut(sentence) # pynlpir.close() # 释放 return lemmas def postag(self, lemmas): """对分词后的结果进行词性标注 Args: lemmas: list,分词后的结果 entity_dict: set,实体词典,处理具体的一则判决书的结构化文本时产生 Returns: words: WordUnit list,包含分词与词性标注结果 """ words = [] # 存储句子处理后的词单元 # 词性标注 postags = self.postagger.postag(lemmas) for i in range(len(lemmas)): # 存储分词与词性标记后的词单元WordUnit,编号从1开始 word = WordUnit(i + 1, lemmas[i], postags[i]) words.append(word) # self.postagger.release() # 释放 return words def get_postag(self, word): """获得单个词的词性标注 Args: word: str,单词 Returns: post_tag: str,该单词的词性标注 """ post_tag = self.postagger.postag([ word, ]) return post_tag[0] def netag(self, words): """命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并 Args: words: WordUnit list,包含分词与词性标注结果 Returns: words_netag: WordUnit list,包含分词,词性标注与命名实体识别结果 """ lemmas = [] # 存储分词后的结果 postags = [] # 存储词性标书结果 for word in words: lemmas.append(word.lemma) postags.append(word.postag) # 命名实体识别 netags = self.recognizer.recognize(lemmas, postags) # print('\t'.join(netags)) # just for test words_netag = EntityCombine().combine(words, netags) # self.recognizer.release() # 释放 return words_netag def parse(self, words): """对分词,词性标注与命名实体识别后的结果进行依存句法分析(命名实体识别可选) Args: words_netag: WordUnit list,包含分词,词性标注与命名实体识别结果 Returns: *: SentenceUnit,该句子单元 """ lemmas = [] # 分词结果 postags = [] # 词性标注结果 for word in words: lemmas.append(word.lemma) postags.append(word.postag) # 依存句法分析 arcs = self.parser.parse(lemmas, postags) for i in range(len(arcs)): words[i].head = arcs[i].head words[i].dependency = arcs[i].relation # self.parser.release() return SentenceUnit(words) def close(self): """关闭与释放nlp""" # pynlpir.close() self.postagger.release() self.recognizer.release() self.parser.release()
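A usage sketch for the NLP wrapper above, assuming the resource/ and model/ directories exist and that WordUnit, EntityCombine and SentenceUnit come from the same project; the call order mirrors the pipeline the class exposes.

# Usage sketch: segment -> postag -> netag -> parse.
nlp = NLP()
lemmas = nlp.segment('李克强总理今天来北京大学发表讲话。')
words = nlp.postag(lemmas)
words_netag = nlp.netag(words)
sentence = nlp.parse(words_netag)   # SentenceUnit with head/dependency filled in
nlp.close()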
class Parser: def __init__(self): os.environ['STANFORD_PARSER'] = STANFORD_PARSER_PATH os.environ['STANFORD_MODELS'] = STANFORD_MODELS_PATH os.environ['JAVAHOME'] = JAVA_HOME stanford_model_path = CHINESE_MODEL_PATH self.s_parser = stanford.StanfordParser(model_path=stanford_model_path) par_model_path = os.path.join( LTP_DATA_DIR, 'parser.model') # 依存句法分析模型路径,模型名称为`parser.model` from pyltp import Parser self.parser = Parser() # 初始化实例 self.parser.load(par_model_path) # 加载模型 cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model') # 分词模型路径,模型名称为`cws.model` from pyltp import Segmentor self.segmentor = Segmentor() # 初始化实例 self.segmentor.load(cws_model_path) # 加载模型 pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model') # 词性标注模型路径,模型名称为`pos.model` from pyltp import Postagger self.postagger = Postagger() # 初始化实例 self.postagger.load(pos_model_path) # 加载模型 ner_model_path = os.path.join( LTP_DATA_DIR, 'ner.model') # 命名实体识别模型路径,模型名称为`pos.model` from pyltp import NamedEntityRecognizer self.recognizer = NamedEntityRecognizer() # 初始化实例 self.recognizer.load(ner_model_path) # 加载模型 q_words = { 'q1_person': ['谁', '那个', '哪个'], 'q1_time': ['那年', '时间', '哪年', '何时', '多久', '时候', '年'], 'q1_amount': ['多', '几', '多少', '第几'], 'q1_place': ['哪儿', '哪家', '哪里人', '哪里', '那家', '那里人', '那里'], 'q1_result': ['怎么', '为什么', '为何', '如何', '何'], 'q1_judge': ['是否', '还是', '吗'], 'q0_other': ['哪些', '那些', '干什么'], 'q0_definition': ['什么样', '什么', '怎么样', '怎样'], } self.question_words = [] self.word2key = {} for k, v in q_words.items(): self.question_words += v for _v in v: self.word2key[_v] = k self.stop_words = set() with open('../data/all-stop-word.txt') as f_stop: for i in f_stop.readlines(): self.stop_words.add(i.strip()) self.articles = [] def cut_sentence(self, sent, stop=False): """ 句子分词 :param sent: :param stop: :return: """ if stop: words = list( filter(lambda x: x not in self.stop_words, list(self.segmentor.segment(sent.strip())))) else: words = list(self.segmentor.segment(sent.strip())) return words def get_question_type(self, question): """ 获取问题类型 :param question: :return: """ q_type = '' words = self.cut_sentence(question) flag = False for w in self.question_words: if w in words: flag = True q_type = self.word2key[w] break if not flag: # print(i, words) q_type = 'other' print(q_type) def word_count(self, sentences): """ 篇章中的词频统计 :param sentences: 句子列表 :return: """ all_words = [] for i in sentences: all_words += self.cut_sentence(i, True) word_count = {} for i in all_words: if i in word_count: word_count[i] += 1 else: word_count[i] = 1 return word_count, sum(word_count.values()) def read_train_set(self, file_path): """ 读取测试文件 :param file_path: 文件路径 :return: """ with open(file_path) as f_in: last_q = '' article = {'question': '', 'result': '', 'sentences': []} for i in f_in.readlines(): line = i.strip().split('\t') if last_q == line[1]: article['sentences'].append(line[2]) if int(line[0]) == 1: article['result'] = line[2] else: self.articles.append(article) article = { 'question': line[1], 'result': '', 'sentences': [] } last_q = line[1] self.articles.append(article) self.articles = self.articles[1:] print(len(self.articles)) print(self.articles[0]) def tf_idf(self): with open('../data/question_word.txt') as f_in: pass def analysis_question(self, index, debug=True): if len(self.articles) <= 0: return article = self.articles[index] q_words = self.cut_sentence(article['question'], True) true_result = ''.join(self.cut_sentence(article['result'], True)) if debug: print('q', self.cut_sentence(article['question'], True)) print('q', 
article['question']) print('a', self.cut_sentence(article['result'], True)) print('a', true_result) # print(q_words) # 候选答案句切词 l_words = [ self.cut_sentence(line, True) for line in article['sentences'] ] # 计算关键词idf idf = {} for word in q_words: count = 0 for line in l_words: if word in line: count += 1 idf[word] = count idf = { k: math.log(len(l_words) * 1.0 / (v + 1)) if len(l_words) > 0 else 0 for k, v in idf.items() } # print(idf) line2score = {} for line in l_words: score = 0 for word in q_words: # 计算关键词tf tf = 0 delta = 1 if len(re.findall('\d+', word)) > 0: delta = 3 for i in line: if i == word: tf += 1 if len(line) == 0: tf = 0 else: tf = (tf * 1.0 * delta) / len(line) score += tf * idf[word] line2score[''.join(line)] = score res = sorted(line2score.items(), key=lambda x: x[1], reverse=True) if debug: for i in res: print(i[1], i[0]) if len(res) > 0: for i in range(len(res)): if res[i][0] == true_result: return i + 1 return 0 else: return 0
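A usage sketch for the question-answering Parser above, assuming the Stanford/LTP model paths, '../data/all-stop-word.txt' and a tab-separated training file are in place; the train-file path below is hypothetical.

# Usage sketch: load a train set, classify a question, rank candidate answer sentences.
qa = Parser()
qa.read_train_set('../data/train.txt')        # hypothetical path
qa.get_question_type('丁酉年是哪一年?')
rank = qa.analysis_question(0, debug=False)   # rank of the gold answer sentence, 0 if missed
print(rank)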
class PreTrain(object): relationsMapping = { 'other': 0, 'locaA': 1, 'locAa': 2, 'med-ill': 3, 'ill-med': 4, "clsaA": 5, "clsAa": 6, "w-c": 7, "c-w": 8, "cs-ef": 9, "ef-cs": 10 } distanceMapping = {'PADDING': 0, 'LowerMin': 1, 'GreaterMax': 2} minDistance = -100 maxDistance = 100 maxSentenceLen = 100 max_distance = 204 parser = Parser() # 初始化实例 def __init__(self, w2vmodel_path): self.parser.load("../LTP/parser.model") # 加载模型 self.model = models.Word2Vec.load(w2vmodel_path) self.model_vocab = self.model.wv.vocab self.model_embedding = self.model.wv.get_keras_embedding(False) for dis in range(self.minDistance, self.maxDistance + 1): self.distanceMapping[dis] = len(self.distanceMapping) print(len(self.distanceMapping)) def load_w2vEmb(self): return self.model def sentence_w2v(self, pos1, pos2, sentence): pos1 = int(pos1) pos2 = int(pos2) sdp = np.zeros(self.maxSentenceLen, dtype=np.float32) tokenidxs = np.zeros(self.maxSentenceLen) positionValues1 = np.zeros(self.maxSentenceLen) positionValues2 = np.zeros(self.maxSentenceLen) tokens = str(sentence).split(" ") words = tokens.copy() flags = [] slen = len(tokens) for idx in range(0, slen): sdp[idx] = 0.3 distance1 = idx - int(pos1) distance2 = idx - int(pos2) if distance1 in self.distanceMapping: positionValues1[idx] = self.distanceMapping[distance1] elif distance1 <= self.minDistance: positionValues1[idx] = self.distanceMapping['LowerMin'] else: positionValues1[idx] = self.distanceMapping['GreaterMax'] if distance2 in self.distanceMapping: positionValues2[idx] = self.distanceMapping[distance2] elif distance2 <= self.minDistance: positionValues2[idx] = self.distanceMapping['LowerMin'] else: positionValues2[idx] = self.distanceMapping['GreaterMax'] if idx == pos1 or idx == pos2: flags.append("kej") else: flags.append(pseg.lcut(tokens[idx])[0].flag) if not self.model.__contains__(tokens[idx]): temp = jieba.lcut(tokens[idx]) tokens[idx] = temp[len(temp) - 1] if not self.model.__contains__(tokens[idx]): # print(str(idx) + " " + str(tokens)) # print(tokens[idx]) tokens[idx] = 'UNKNOWN_WORD' tokenidxs[idx] = self.model_vocab[tokens[idx]].index arcs = self.parser.parse(words, flags) # 句法分析 # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)) # for i in range(len(words)): # print(str(i + 1) + " " + words[i] + " " + flags[i] + " " + str(arcs[i].head) + ":" + arcs[i].relation) iter_idx = pos1 while True: if arcs[iter_idx].relation != "HED": sdp[iter_idx] = 0.8 iter_idx = (arcs[iter_idx].head - 1) else: sdp[iter_idx] = 0.8 break iter_idx = pos2 while True: if arcs[iter_idx].relation != "HED": sdp[iter_idx] = 0.8 iter_idx = (arcs[iter_idx].head - 1) else: sdp[iter_idx] = 0.8 break # for i in range(len(words)): # print(str(i + 1) + " " + words[i] + " " + flags[i] + " " + str(arcs[i].head) + ":" + arcs[i].relation + " " + str(sdp[i])) return tokenidxs, positionValues1, positionValues2, sdp # def sentence_w2v(self, pos1, pos2, sentence): # pos1 = int(pos1) # pos2 = int(pos2) # sdp = np.zeros(self.maxSentenceLen, dtype=np.float32) # tokenidxs = np.zeros(self.maxSentenceLen) # positionValues1 = np.zeros(self.maxSentenceLen) # positionValues2 = np.zeros(self.maxSentenceLen) # tokens = str(sentence).split(" ") # # words 是 tokens 的副本 # words = tokens.copy() # flags = [] # slen = len(tokens) # for idx in range(0, slen): # sdp[idx] = 0.3 # distance1 = idx - int(pos1) # distance2 = idx - int(pos2) # if distance1 in self.distanceMapping: # positionValues1[idx] = self.distanceMapping[distance1] # elif distance1 <= self.minDistance: # 
positionValues1[idx] = self.distanceMapping['LowerMin'] # else: # positionValues1[idx] = self.distanceMapping['GreaterMax'] # # if distance2 in self.distanceMapping: # positionValues2[idx] = self.distanceMapping[distance2] # elif distance2 <= self.minDistance: # positionValues2[idx] = self.distanceMapping['LowerMin'] # else: # positionValues2[idx] = self.distanceMapping['GreaterMax'] # # if idx == pos1 or idx == pos2: # flags.append("kej") # else: # flags.append(pseg.lcut(tokens[idx])[0].flag) # # if not self.model.__contains__(tokens[idx]): # temp = jieba.lcut(tokens[idx]) # tokens[idx] = temp[len(temp) - 1] # if not self.model.__contains__(tokens[idx]): # # print(str(idx) + " " + str(tokens)) # # print(tokens[idx]) # tokens[idx] = 'UNKNOWN_WORD' # tokenidxs[idx] = self.model_vocab[tokens[idx]].index # # arcs = parser.parse(words, flags) # 句法分析 # # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)) # # for i in range(len(words)): # # print(str(i + 1) + " " + words[i] + " " + flags[i] + " " + str(arcs[i].head) + ":" + arcs[i].relation) # iter_idx = pos1 # while True: # if arcs[iter_idx].relation != "HED": # sdp[iter_idx] = 0.8 # iter_idx = (arcs[iter_idx].head - 1) # else: # sdp[iter_idx] = 0.8 # break # iter_idx = pos2 # while True: # if arcs[iter_idx].relation != "HED": # sdp[iter_idx] = 0.8 # iter_idx = (arcs[iter_idx].head - 1) # else: # sdp[iter_idx] = 0.8 # break # # # for i in range(len(words)): # # print(str(i + 1) + " " + words[i] + " " + flags[i] + " " + str(arcs[i].head) + ":" + arcs[i].relation + " " + str(sdp[i])) # return tokenidxs, positionValues1, positionValues2, sdp # def get_max_sentence_len(self,lines): # maxlen = 0 # for line in lines: # splits = line.strip().split('\t') # sentence = splits[3] # tokens = sentence.split(" ") # maxlen = max(maxlen, len(tokens)) # return maxlen def process_one_input(self, input): temps = str(input).split("\t") relation = temps[0] pos1 = temps[1] pos2 = temps[2] sentence = temps[3].strip() tokenidxs, positionValues1, positionValues2, sdp = self.sentence_w2v( pos1, pos2, sentence) return self.relationsMapping[ relation], tokenidxs, positionValues1, positionValues2, sdp def process_file(self, file, saveFlag=True, savepath='../pkl/sem-relations.pkl.gz'): relationidxs = [] positionMatrix1 = [] positionMatrix2 = [] tokenMatrix = [] sdpMatrix = [] with codecs.open(file, "r", "utf8") as rd: lines = rd.readlines() # self.maxSentenceLen = self.get_max_sentence_len(lines) for line in lines: #检查长度 if len(line.split("\t")[3].split(" ")) > self.maxSentenceLen: print("超过长度") continue relationidx, tokenidxs, positionValues1, positionValues2, sdp = self.process_one_input( line) relationidxs.append(relationidx) positionMatrix1.append(positionValues1) positionMatrix2.append(positionValues2) tokenMatrix.append(tokenidxs) sdpMatrix.append(sdp) relationidxs = np.asarray(relationidxs, dtype='int32') positionMatrix1 = np.asarray(positionMatrix1, dtype='int32') positionMatrix2 = np.asarray(positionMatrix2, dtype='int32') tokenMatrix = np.asarray(tokenMatrix, dtype='int32') sdpMatrix = np.asarray(sdpMatrix, dtype='float32') if saveFlag: self.save_pkl(relationidxs, positionMatrix1, positionMatrix2, tokenMatrix, sdpMatrix, savepath) return relationidxs, positionMatrix1, positionMatrix2, tokenMatrix def save_pkl(self, relationidxs, positionMatrix1, positionMatrix2, tokenMatrix, sdpMatrix, save_path): data = { 'relationidxs': relationidxs, 'positionMatrix1': positionMatrix1, 'positionMatrix2': positionMatrix2, 'tokenMatrix': tokenMatrix, "sdpMatrix": 
sdpMatrix
        }
        f = gzip.open(save_path, 'wb')
        pkl.dump(data, f)
        f.close()

    def process_one(self, line):
        # self.maxSentenceLen = 78
        if len(line.split("\t")[3].split(" ")) > self.maxSentenceLen:
            print("超过长度")  # sentence exceeds the maximum length
            return None
        # process_one_input returns five values (including sdp); unpack all of them
        relationidx, tokenidxs, positionValues1, positionValues2, sdp = self.process_one_input(
            line)
        relationidx = np.asarray(relationidx, dtype='int32')
        positionMatrix1 = np.asarray(positionValues1, dtype='int32')
        positionMatrix2 = np.asarray(positionValues2, dtype='int32')
        tokenMatrix = np.asarray(tokenidxs, dtype='int32')
        tokenMatrix = tokenMatrix.reshape((1, self.maxSentenceLen))
        positionMatrix1 = positionMatrix1.reshape((1, self.maxSentenceLen))
        positionMatrix2 = positionMatrix2.reshape((1, self.maxSentenceLen))
        return relationidx, positionMatrix1, positionMatrix2, tokenMatrix

# pre = PreTrain("../w2vmodel/word2vec2.model")
# pre.process_file("../files/train.txt",True,'../pkl/train2.pkl.gz')
# pre.sentence_w2v(2,4,"入宫 为 魏孝文帝 和 文明太后 治过 病 , 多有 疗效")
import os
import pickle
from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer
from scipy.spatial.distance import cosine
from bert_serving.client import BertClient

cws_model_path = os.path.join(os.path.abspath('./'), 'ltp_Model', 'cws.model')
pos_model_path = os.path.join(os.path.abspath('./'), 'ltp_Model', 'pos.model')
par_model_path = os.path.join(os.path.abspath('./'), 'ltp_Model', 'parser.model')
ner_model_path = os.path.join(os.path.abspath('./'), 'ltp_Model', 'ner.model')
say_words_path = os.path.join(os.path.abspath('./'), 'data', 'saying_words.pickle')

segmentor = Segmentor()                 # word segmentation
postagger = Postagger()                 # POS tagging
recognizer = NamedEntityRecognizer()    # named entity recognition
parser = Parser()                       # dependency parsing
segmentor.load(cws_model_path)
postagger.load(pos_model_path)
recognizer.load(ner_model_path)
parser.load(par_model_path)

# load saying words
say_words = pickle.load(open(say_words_path, 'rb'))

# dependency-parse a single sentence
def parsing(sentence):
    words = segmentor.segment(sentence)   # pyltp word segmentation
    postags = postagger.postag(words)     # POS tagging
    arcs = parser.parse(words, postags)   # dependency parsing
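The imports above pull in bert_serving and scipy's cosine, but the similarity code itself is not shown here. A minimal sketch, assuming a bert-serving-server instance is already running with default settings:

# Sentence similarity via bert-as-service embeddings and cosine distance.
bc = BertClient()
v1, v2 = bc.encode(['他说今天天气不错', '他表示今天天气很好'])
similarity = 1 - cosine(v1, v2)   # cosine() returns a distance, so invert it
print(similarity)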
class LtpParser: def __init__(self): LTP_DIR = "/Users/rilzob/PycharmProjects/SubjectKG/ltp_data_v3.4.0" self.segmentor = Segmentor() self.segmentor.load(os.path.join(LTP_DIR, "cws.model")) self.postagger = Postagger() self.postagger.load(os.path.join(LTP_DIR, "pos.model")) self.parser = Parser() self.parser.load(os.path.join(LTP_DIR, "parser.model")) self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(LTP_DIR, "ner.model")) self.labeller = SementicRoleLabeller() self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model')) '''语义角色标注''' def format_labelrole(self, words, postags): arcs = self.parser.parse(words, postags) roles = self.labeller.label(words, postags, arcs) # 打印结果 for role in roles: print( role.index, "".join([ "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments ])) # self.labeller.release() # 释放模型 roles_dict = {} for role in roles: roles_dict[role.index] = { arg.name: [arg.name, arg.range.start, arg.range.end] for arg in role.arguments } return roles_dict '''句法分析---为句子中的每个词语维护一个保存句法依存儿子节点的字典''' def build_parse_child_dict(self, words, postags, arcs): child_dict_list = [] format_parse_list = [] # print('words:', words) # print('postags:', postags) # print('arcs:', arcs) for index in range(len(words)): child_dict = dict() for arc_index in range(len(arcs)): # print('arc_index:', arc_index) if arcs[arc_index].head == index + 1: #arcs的索引从1开始 # print('arc_index.relation:', arcs[arc_index].relation) if arcs[arc_index].relation in child_dict: child_dict[arcs[arc_index].relation].append(arc_index) else: child_dict[arcs[arc_index].relation] = [] child_dict[arcs[arc_index].relation].append(arc_index) # print('child_dict:', child_dict) child_dict_list.append(child_dict) rely_id = [arc.head for arc in arcs] # 提取依存父节点id relation = [arc.relation for arc in arcs] # 提取依存关系 heads = ['Root' if id == 0 else words[id - 1] for id in rely_id] # 匹配依存父节点词语 for i in range(len(words)): # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n'] a = [ relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1] ] format_parse_list.append(a) return child_dict_list, format_parse_list '''parser主函数''' def parser_main(self, sentence): words = list(self.segmentor.segment(sentence)) # segment是将句子分词后的返回值并且使用list转换为Python的列表类型,原类型为native的VectorOfString postags = list(self.postagger.postag(words)) # postag是将words进行词性标注的返回结果 arcs = self.parser.parse(words, postags) # parse是进行依存句法分析 child_dict_list, format_parse_list = self.build_parse_child_dict( words, postags, arcs) # # 较原来的版本修改的部分 # old_child_dict_list, old_format_parse_list = self.build_parse_child_dict(words, postags, arcs) # # print('child_dict_list:', child_dict_list) # # print('format_parse_list:', format_parse_list) # new_format_parse_list = old_format_parse_list # # # 找到中心词在old_format_parse_list的index # hed_num = 0 # 中心词的index # for format_parse in old_format_parse_list: # if old_format_parse_list[0] == 'HED': # hed_num = format_parse[2] # else: # continue # # # 找到被中心词所支配的主语 # subject = '' # 中心词的从属词 # for format_parse in old_format_parse_list: # if format_parse[0] == 'SBV' and format_parse[5] == hed_num: # subject = old_format_parse_list[1] # else: # continue # # # 对原文进行修改,增加主语 # for format_parse in old_format_parse_list: # if format_parse[0] == 'ADV': # if old_format_parse_list[format_parse[5]][0] == 'COO': # new_format_parse_list.insert(format_parse[2], list(subject)) # else: # continue # # # # for roles_dict = self.format_labelrole(words, postags) return words, postags, child_dict_list, 
roles_dict, format_parse_list def supply_subject(self, old_format_parse_list): # 较原来的版本修改的部分 # print('child_dict_list:', child_dict_list) # print('format_parse_list:', format_parse_list) new_format_parse_list = old_format_parse_list # 找到中心词在old_format_parse_list的index hed_num = 0 # 中心词的index for old_format_parse in old_format_parse_list: if old_format_parse[0] == 'HED': hed_num = old_format_parse[2] else: continue # 找到被中心词所支配的主语 subject = '' # 中心词的从属词 for old_format_parse in old_format_parse_list: if old_format_parse[0] == 'SBV' and old_format_parse[5] == hed_num: subject = old_format_parse[1] else: continue # 对原文进行修改,增加主语 for old_format_parse in old_format_parse_list: if old_format_parse[0] == 'ADV': if old_format_parse_list[old_format_parse[5]][0] == 'COO': new_format_parse_list.insert(old_format_parse[2], list(('', subject))) else: continue # 生成补充主语后的新句子 string = '' for new_format_parse in new_format_parse_list: string = string + new_format_parse[1] return string
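A usage sketch for the LtpParser above; the LTP_DIR path is machine-specific and must be adjusted, and supply_subject() only rewrites sentences that match its ADV/COO pattern.

# Usage sketch: full parse plus the subject-supplementing rewrite.
ltp = LtpParser()
sentence = '李克强总理今天来我家了。'
words, postags, child_dict_list, roles_dict, format_parse_list = ltp.parser_main(sentence)
print(words, postags)
print(format_parse_list)
# supply_subject() tries to re-insert the governing subject before ADV words whose
# head is a COO verb, and returns the rebuilt sentence string.
print(ltp.supply_subject(format_parse_list))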
class LtpParser: def __init__(self): #initialize every ltp tool LTP_DIR = "E:\code_Athena_Support" #分词器 self.segmentor = Segmentor() self.segmentor.load(os.path.join(LTP_DIR, "cws.model")) #词性标注 self.postagger = Postagger() self.postagger.load(os.path.join(LTP_DIR, "pos.model")) #依存句法分析 self.parser = Parser() self.parser.load(os.path.join(LTP_DIR, "parser.model")) #命名实体识别 self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(LTP_DIR, "ner.model")) #语义角色标注模块 self.labeller = SementicRoleLabeller() self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model')) '''语义角色标注''' def format_labelrole(self, words, postags): #依赖于词性的标注,做依存句法的分析 #解释: #依存句法分析是基于词性标注的。 arcs = self.parser.parse(words, postags) #根据依存句法的分析,标注语义角色 roles = self.labeller.label(words, postags, arcs) #以字典储存,key为编号,value为列表 #而且是嵌套字典,以arg.name作为key #这个字典的含义就是:每个角色的索引是一级key,二级字典以语义角色类型为key roles_dict = {} for role in roles: roles_dict[role.index] = { arg.name: [arg.name, arg.range.start, arg.range.end] for arg in role.arguments } print(roles_dict) return roles_dict '''句法分析---为句子中的每个词语维护一个保存句法依存儿子节点的字典''' def build_parse_child_dict(self, words, postags, arcs): #其数据结构是: #这个list底下是一个个字典,每个字典的key是关系名称,每个字典的value是这个关系所对应的词语,这样就得到了父节点们所拥有的关系及有这种关系的孩子 child_dict_list = [] #这个list的意义就是展示每个词的依存关系 format_parse_list = [] #一级循环:对每个词分析 for index in range(len(words)): #预设孩子字典 child_dict = dict() #二级循环:查每个词的语义角色 for arc_index in range(len(arcs)): #这里无非就是查一下我到底有没有成为谁的爸爸,如果有的话就登记一下 if arcs[arc_index].head == index + 1: #arcs的索引从1开始 if arcs[arc_index].relation in child_dict: child_dict[arcs[arc_index].relation].append(arc_index) else: child_dict[arcs[arc_index].relation] = [] child_dict[arcs[arc_index].relation].append(arc_index) child_dict_list.append(child_dict) rely_id = [arc.head for arc in arcs] # 提取依存父节点id relation = [arc.relation for arc in arcs] # 提取依存关系 heads = ['Root' if id == 0 else words[id - 1] for id in rely_id] # 匹配依存父节点词语 for i in range(len(words)): # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n'] a = [ relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1] ] format_parse_list.append(a) return child_dict_list, format_parse_list '''parser主函数''' def parser_main(self, sentence): '''显然这是一个类的主函数''' words = list(self.segmentor.segment(sentence)) postags = list(self.postagger.postag(words)) arcs = self.parser.parse(words, postags) child_dict_list, format_parse_list = self.build_parse_child_dict( words, postags, arcs) roles_dict = self.format_labelrole(words, postags) return words, postags, child_dict_list, roles_dict, format_parse_list
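Each entry of the format_parse_list built above is a seven-element list (the field order follows the inline example ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']); a small helper that prints it in readable form:

# Pretty-print format_parse_list rows: [relation, word, word_idx, word_pos,
# head_word, head_idx (-1 means Root), head_pos], as built in build_parse_child_dict().
def print_parse_list(format_parse_list):
    for rel, word, idx, pos, head, head_idx, head_pos in format_parse_list:
        print('%s(%d:%s/%s -> %d:%s/%s)' % (rel, idx, word, pos, head_idx, head, head_pos))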
class LtpParser: def __init__(self): LTP_DIR = "D:\ltp_data" self.segmentor = Segmentor() self.segmentor.load(os.path.join(LTP_DIR, "cws.model")) self.postagger = Postagger() self.postagger.load(os.path.join(LTP_DIR, "pos.model")) self.parser = Parser() self.parser.load(os.path.join(LTP_DIR, "parser.model")) self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(LTP_DIR, "ner.model")) self.labeller = SementicRoleLabeller() self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model')) '''语义角色标注''' def format_labelrole(self, words, postags): arcs = self.parser.parse(words, postags) roles = self.labeller.label(words, postags, arcs) roles_dict = {} for role in roles: roles_dict[role.index] = { arg.name: [arg.name, arg.range.start, arg.range.end] for arg in role.arguments } return roles_dict '''句法分析---为句子中的每个词语维护一个保存句法依存儿子节点的字典''' def build_parse_child_dict(self, words, postags, arcs): child_dict_list = [] format_parse_list = [] for index in range(len(words)): child_dict = dict() for arc_index in range(len(arcs)): if arcs[arc_index].head == index + 1: # arcs的索引从1开始 if arcs[arc_index].relation in child_dict: child_dict[arcs[arc_index].relation].append(arc_index) else: child_dict[arcs[arc_index].relation] = [] child_dict[arcs[arc_index].relation].append(arc_index) child_dict_list.append(child_dict) rely_id = [arc.head for arc in arcs] # 提取依存父节点id relation = [arc.relation for arc in arcs] # 提取依存关系 heads = ['Root' if id == 0 else words[id - 1] for id in rely_id] # 匹配依存父节点词语 for i in range(len(words)): # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n'] a = [ relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1] ] format_parse_list.append(a) return child_dict_list, format_parse_list '''parser主函数''' def parser_main(self, sentence): words = list(self.segmentor.segment(sentence)) postags = list(self.postagger.postag(words)) arcs = self.parser.parse(words, postags) child_dict_list, format_parse_list = self.build_parse_child_dict( words, postags, arcs) roles_dict = self.format_labelrole(words, postags) return words, postags, child_dict_list, roles_dict, format_parse_list
import os
# import pandas as pd
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer

# Set your own model path
MODELDIR = "ltp_data"

print("正在加载LTP模型... ...")
segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
print("加载模型完毕。")

in_file_name = "input.txt"
out_file_name = "outlgh.txt"
begin_line = 1
end_line = 0
# df = pd.DataFrame(columns=["逻辑关系", "实体一", "关系", "实体二"])

def extraction_start(in_file_name, out_file_name, begin_line, end_line):
def answersemantic(resultwordlist, resultposlist): # 根据ltp进行句法分析,转换为 postagger = Postagger() # 初始化实例 pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model') postagger.load(pos_model_path) # 加载模型 parser = Parser() # 初始化实例 par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model') parser.load(par_model_path) # 加载模型 postags = postagger.postag(resultwordlist) # 词性标注'' poslist = [] for i in postags: poslist.append(str(i)) print(poslist) arcs = parser.parse(resultwordlist, poslist) print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)) arcshead = [] arcsrela = [] for i in arcs: arcshead.append(i.head) arcsrela.append(i.relation) print(arcshead) print(arcsrela) semanticlist = [] length = len(resultwordlist) poedictlist = [] quenum = -1 for i in range(0, len(resultposlist)): if resultposlist[i] == "question": quenum = i print("resultposlist,resultwordlist: ", resultwordlist, resultposlist) for i in range(0, length): if resultposlist[i] in nertypelist: num = findproperty(i, arcshead, arcsrela, resultposlist) if num != -1: # resultposlist[arcshead[i]-1]=="property":#战狼2的上映日期是什么时候 mov的属性是 # if arcsrela[i]=="ATT" or arcsrela[i]=="SBV": poedict = {} poedict["headnode"] = resultwordlist[i] poedict["headnodetype"] = resultposlist[i] if quenum == -1: questr = "" else: questr = questiondict[resultwordlist[quenum]] properresult = getrelation(propertydict[resultwordlist[num]], resultposlist[i], questr) endnodetype = getnodetype(propertydict[resultwordlist[num]], resultposlist[i], questr) poedict["relation"] = properresult poedict["endnode"] = "" poedict["endnodetype"] = endnodetype poedict["quesion"] = questr poedictlist.append(poedict) print(poedictlist) postagger.release() # 释放模型 parser.release() # 释放模型 return poedictlist
class MyLTP(): def __init__(self): ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir) # sys.path = [os.path.join(ROOTDIR, "lib")] + sys.path # Set your own model path self.MODELDIR = os.path.join(ROOTDIR, "./ltp_data") # Init LTP Model self.segmentor = Segmentor() self.postagger = Postagger() self.parser = Parser() self.recognizer = NamedEntityRecognizer() self.labeller = SementicRoleLabeller() self.segmentor.load(os.path.join(self.MODELDIR, "cws.model")) self.postagger.load(os.path.join(self.MODELDIR, "pos.model")) self.parser.load(os.path.join(self.MODELDIR, "parser.model")) self.recognizer.load(os.path.join(self.MODELDIR, "ner.model")) self.labeller.load(os.path.join(self.MODELDIR, "pisrl.model")) # 下述函数返回值均为 list, list[0] 为第一个句子的运行结果 # ---------------------------- 分词 ------------------------------- def MySegmentor(self, paragraph): # 段落分成句子 sentences = SentenceSplitter.split(paragraph) result = [] for sentence in sentences: words = self.segmentor.segment(sentence) # 输出 # print("\t".join(words)) result.append(words) return result # ---------------------------- 词性标注 ------------------------------- def MyPostagger(self, words): result = [] for word in words: postags = self.postagger.postag(word) # list-of-string parameter is support in 0.1.5 # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"]) # 输出 # print("\t".join(postags)) result.append(postags) return result # ---------------------------- 依存句法分析 ------------------------------- def MyParser(self, words, postags): result = [] for index in range(0, len(words)): arcs = self.parser.parse(words[index], postags[index]) # 输出 # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)) result.append(arcs) return result # ---------------------------- 命名实体识别 ------------------------------- def MyRecognizer(self, words, postags): result = [] for index in range(0, len(words)): netags = self.recognizer.recognize(words[index], postags[index]) # 输出 # print("\t".join(netags)) result.append(netags) return result # ---------------------------- 语义角色标注 ------------------------------- def MyRoleLabller(self, words, postags, arcs): result = [] for index in range(0, len(words)): roles = self.labeller.label(words[index], postags[index], arcs[index]) # 输出 # for role in roles: # print(role.index, "".join( # ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])) result.append(roles) return result
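A usage sketch for MyLTP above; the model directory is resolved relative to the module file, so the paths are an assumption of the surrounding project layout.

# Usage sketch: run the whole MyLTP pipeline on a short paragraph.
myltp = MyLTP()
paragraph = '元芳你怎么看?我就趴窗口上看呗!'
words = myltp.MySegmentor(paragraph)            # list of token vectors, one per sentence
postags = myltp.MyPostagger(words)
arcs = myltp.MyParser(words, postags)
netags = myltp.MyRecognizer(words, postags)
roles = myltp.MyRoleLabller(words, postags, arcs)
for sent_words, sent_arcs in zip(words, arcs):
    print('\t'.join('%s(%d:%s)' % (w, a.head, a.relation)
                    for w, a in zip(sent_words, sent_arcs)))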
class DSFN: """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析 Attributes: default_user_dict_dir:str,用户自定义词典目录 default_model_dir:str,ltp模型文件目录 """ entity_verb_new = entity_verb_new() all_entity = entity_verb_new.readAllEntity("../../entity_verb//entity_verb_result\\all_entity.json") default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\' # LTP模型文件目录 def __init__(self, model_dir=default_model_dir, all_entity=all_entity): self.default_model_dir = model_dir # 加载ltp模型 # default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\' # LTP模型文件目录 self.segmentor = Segmentor() user_dict = "..\\source\\user.txt" segmentor_flag = self.segmentor.load_with_lexicon(os.path.join(default_model_dir, 'cws.model'), user_dict) # segmentor_flag = self.segmentor.load(os.path.join(default_model_dir, 'cws.model')) # 词性标注模型 self.postagger = Postagger() postag_flag = self.postagger.load(os.path.join(self.default_model_dir, 'pos.model')) # 命名实体识别模型 self.recognizer = NamedEntityRecognizer() ner_flag = self.recognizer.load(os.path.join(self.default_model_dir, 'ner.model')) # 依存句法分析模型 self.parser = Parser() parser_flag = self.parser.load(os.path.join(self.default_model_dir, 'parser.model')) if segmentor_flag or postag_flag or ner_flag or parser_flag: # 可能有错误 print('load model failed') def segment(self, sentence, entity_postag=dict()): words = self.segmentor.segment(sentence) lemmas = [] for lemma in words: lemmas.append(lemma) return lemmas def getPostag(self): return self.postagger def postag(self, lemmas): """ Parameters ---------- lemmas : List,分词后的结果 entity_dict:Set,实体词典,处理具体的一则判决书的结构化文本时产生 Returns ------- words:WordUnit List,包括分词与词性标注的结果 """ words = [] # 词性标注 postags = self.postagger.postag(lemmas) for i in range(len(lemmas)): # 存储分词与词性标记后的词单元WordUnit,编号从1开始 word = WordUnit(i + 1, lemmas[i], postags[i]) words.append(word) # self.postagger.release() #释放 return words def get_postag(self, word): """获得单个词的词性标注 Args: word:str,单词 Returns: pos_tag:str,该单词的词性标注 """ pos_tag = self.postagger.postag([word]) return pos_tag[0] def netag(self, words): """ 命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并 Parameters words : WordUnit list,包括分词与词性标注结果 Returns words_netag:WordUnit list,包含分词,词性标注与命名实体识别的结果 """ lemmas = [] # 存储分词后的结果 postags = [] # 存储词性标注结果 for word in words: lemmas.append(word.lemma) postags.append(word.postag) # 命名实体识别 netags = self.recognizer.recognize(lemmas, postags) words_netag = EntityCombine().combine(words, netags) return words_netag def parse(self, words): """ 对分词,词性标注与命名实体识别后的结果进行依存句法分析(命名实体识别可选) Args: words_netag:WordUnit list,包含分词,词性标注与命名实体识别结果 Returns *:sentenceUnit 句子单元 """ lemmas = [] # 分词结果 postags = [] # 词性标注结果 for word in words: lemmas.append(word.lemma) postags.append(word.postag) # 依存句法分析 arcs = self.parser.parse(lemmas, postags) for i in range(len(arcs)): words[i].head = arcs[i].head words[i].dependency = arcs[i].relation return SentenceUnit(words) def close(self): """ 关闭与释放 """ # pynlpir.close() self.postagger.release() self.recognizer.release() self.parser.release() def splitSentence(self,text): pattern = r'。|!|?|;|=' result_list = re.split(pattern, text) result_list = list(filter(self.not_empty, result_list)) # print(result_list) return result_list def splitSentenceByComma(self,text): pattern = r',' result_list = re.split(pattern, text) result_list = list(filter(self.not_empty, result_list)) final_list = [] for sentence in result_list: if len(sentence) <= 40: final_list.append(sentence) return final_list def not_empty(self,s): return s and "".join(s.split()) def 
dsfn1_2_3_4COO(self, sentence, item1, item2): allTripes = [] """ 判断两个实体是否属于DSFN1的情况,并输出三元组 """ location_position_list = ['主席','总统','总理','主任','内','东门','西门','南门','北门','大门','外','国家主席','尚书' ] if self.dsfnConstraints3(sentence,item1,item2) and (item1.dependency == "ATT" and item1.head_word.postag != 'v' and item1.head_word.postag != 'a'): AttWord = item1.head_word AttWordDict = dict() AttWordStr = "" while AttWord.ID < item2.ID: AttWordDict[AttWord.ID] = AttWord.lemma # AttWordStr += AttWord.lemma if (AttWord.dependency == "ATT" and AttWord.head_word.postag != 'v' and AttWord.head_word.postag != 'a' ): AttWord = AttWord.head_word else: break if (AttWord.ID == item2.ID): flag = True while flag: len1 = len(AttWordDict) AttList = AttWordDict.keys() for id in range(item1.ID + 1, item2.ID): item = sentence.get_word_by_id(id) if item.head_word != None and item.head_word.ID in AttList and (item.dependency == "ATT" and item.postag != 'v' and item.postag != 'a'): AttWordDict[item.ID] = item.lemma if len1 == len(AttWordDict): flag = False else: flag = True AttWordDict = sorted(AttWordDict.items(), key=lambda item: item[0]) AttWordStr = "" for i in AttWordDict: AttWordStr += i[1] # print("三元组:(" + item1.lemma + "," + AttWordStr + "," + item2.lemma + ")") if AttWordStr in location_position_list: allTripes.append([item1.lemma, AttWordStr, item2.lemma]) """ 考虑DSFN2的情况 """ if item1.dependency == "SBV" and item1.head_word.postag == "v": pred1 = item1.head_word predDict = dict() predDict[pred1.ID] = pred1.lemma if item2.dependency == "VOB" and item2.head_word.postag == "v": pred2 = item2.head_word predDict[pred2.ID] = pred2.lemma if (len(predDict) == 1): PredWordStr = "" for i in predDict: PredWordStr += predDict[i] # print("DSFN2三元组:(" + item1.lemma + "," + PredWordStr + "," + item2.lemma + ")") allTripes.append([item1.lemma, PredWordStr, item2.lemma]) """ 新加,为了考虑“习近平视察和访问上海”的情况 """ if len(predDict) ==2: num = self.get_entity_num_between(pred1,pred2,sentence) flagSBV = True flagVOB = True for word in sentence.words: if word.dependency == "SBV" and word.head_word.ID == pred2.ID: flagSBV = False if (word.dependency == "VOB" and word.head_word.ID == pred1.ID) or (word.dependency == "POB" \ and word.head_word.dependency == "ADV" and word.head_word.head_word.ID == pred1.ID): flagVOB = False flagCMP= True if pred1!=None and pred1.dependency == "CMP" and pred1.head_word.ID == pred2.ID: flagCMP = False if pred2!=None and pred2.dependency == "CMP" and pred2.head_word.ID == pred1.ID: flagCMP = False # print("pred1:"+pred1.lemma+",pred2:"+pred2.lemma+",num:"+str(num)) if num == 0 : if flagCMP == False : if flagVOB == True and flagSBV == True: allTripes.append([item1.lemma, pred1.lemma + "" +pred2.lemma, item2.lemma]) else: if flagVOB == True: allTripes.append([item1.lemma, pred1.lemma, item2.lemma]) if flagSBV == True: allTripes.append([item1.lemma, pred2.lemma, item2.lemma]) """ DSFN3.0 """ pred = None if item1.dependency == "SBV" and item1.head_word.postag == "v" and item2.dependency == "POB": pred = item1.head_word prep = item2.head_word elif item1.dependency == "FOB" and item2.dependency == "POB": # 考虑介词为“被”的情况,如 “小王被小明所陷害” pred = item1.head_word prep = item2.head_word c = item1 item1 = item2 item2 = c if pred != None and prep != None: if prep.dependency == "ADV": if prep.head_word.ID == pred.ID: pred2 = None object = None objectForPred2 = None for i in range(pred.ID + 1, len(sentence.words) + 1): item = sentence.get_word_by_id(i) if item.dependency == "VOB" and item.head_word.ID == pred.ID: object = item objectDict 
= dict() objectDict[object.ID] = object for word in sentence.words: if word.head_word != None and word.dependency == "ATT" and word.head_word.ID in objectDict: objectDict[word.ID] = word objectDict = sorted(objectDict.items(), key=lambda item: item[0]) objectStr = "" for objectItem in objectDict: objectStr += objectItem[1].lemma # print( # "DSFN3三元组:(" + item1.lemma + "," + pred.lemma + "" + objectStr + "," + item2.lemma + ")") allTripes.append([item1.lemma, pred.lemma + "" + objectStr, item2.lemma]) # print("DSFN3三元组:(" + item1.lemma + "," + pred.lemma + "" + object.lemma + "," + item2.lemma + ")") # allTripes.append([item1.lemma, pred.lemma + "" + object.lemma, item2.lemma]) if object == None: # print("DSFN3三元组:(" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")") allTripes.append([item1.lemma, pred.lemma , item2.lemma]) """ DSFN4 """ pred = None prep = None prep1 = None pred2 = None if item1.dependency == "SBV" and item2.dependency == "POB": pred = item1.head_word prep = item2.head_word if prep.dependency == "CMP": pred2 = prep.head_word if pred2.ID == pred.ID: # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep.lemma + "," + item2.lemma + ")") allTripes.append([item1.lemma, pred.lemma + "" + prep.lemma, item2.lemma]) else : num = self.get_entity_num_between(pred, pred2, sentence) flagSBV = True flagVOB = True for word in sentence.words: if word.dependency == "SBV" and word.head_word.ID == pred2.ID: flagSBV = False if (word.dependency == "VOB" and word.head_word.ID == pred.ID) or (word.dependency == "POB" \ and word.head_word.dependency == "ADV" and word.head_word.head_word.ID == pred.ID): flagVOB = False # print("pred1:"+pred1.lemma+",pred2:"+pred2.lemma+",num:"+str(num)) if num == 0 : flag = True for word in sentence.words: if word.dependency == "CMP" and word.head_word.ID == pred.ID: prep1 = word if prep1 != None: if flagVOB == True: # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep1.lemma + "," + item2.lemma + ")") allTripes.append([item1.lemma, pred.lemma + "" + prep1.lemma, item2.lemma]) # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")") if flagSBV == True: allTripes.append([item1.lemma, pred2.lemma + "" + prep.lemma, item2.lemma]) else: if flagVOB == True: # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")") allTripes.append([item1.lemma, pred.lemma, item2.lemma]) if flagSBV == True: # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")") allTripes.append([item1.lemma, pred2.lemma + "" + prep.lemma, item2.lemma]) """ DSFN5 """ # self.dsfn5and6(rawSentence,sentence,item1,item2) return allTripes def get_entity_num_between(self,verb1,verb2,sentence): """ 获得两个动词之间的实体数量 Parameters ---------- entity1 : WordUnit,动词1 entity2 : WordUnit,动词2 Returns: num:int,两动词间的实体数量 """ if verb1.ID > verb2.ID: c = verb1 verb1 = verb2 verb2 = c num = 0 i = verb1.ID while i < verb2.ID-1: if self.is_entity(sentence.words[i]): num +=1 i +=1 return num def is_entity(self,entry): """判断词单元是否是实体 Args: entry:WordUnit,词单元 Returns: *:bool,实体(True),非实体(False) """ #候选实体词性列表 entity_postags = ['nh','ni','ns','nz','j','n','v'] # print(entry.lemma+" : "+entry.postag) if entry.postag in entity_postags: return True else: return False def dsfnAttCOO(self,sentence,item1,item2): item1Att = item1 item2Att = item2 while item1Att.dependency == "ATT": item1Att = item1Att.head_word allTripe = self.dsfn1_2_3_4COO(sentence,item1Att,item2) if allTripe == None or len(allTripe) 
== 0: while item2Att.dependency == "ATT": item2Att = item2Att.head_word allTripe = self.dsfn1_2_3_4COO(sentence,item1,item2Att) if allTripe == None or len(allTripe) == 0: allTripe = self.dsfn1_2_3_4COO(sentence,item1Att,item2Att) for tripe in allTripe: if tripe[0] == item1Att.lemma: tripe[0] = item1.lemma if tripe[2] == item2Att.lemma: tripe[2] = item2.lemma return allTripe def dsfn5COO(self, sentence, item1, item2): if item1.dependency == "COO": item1COO = item1.head_word allTripes1 = self.dsfn1_2_3_4COO(sentence,item1COO,item2) # print(allTripes1) for tripe in allTripes1: if tripe[0] == item1COO.lemma: tripe[0] = item1.lemma elif tripe[2] == item1COO.lemma: tripe[2] = item1.lemma return allTripes1 # print("allTripes1"+str(allTripes1)) def dsfn6COO(self,sentence,item1,item2): if item2.dependency == "COO": item2COO = item2.head_word allTripes2 = self.dsfn1_2_3_4COO(sentence,item1,item2COO) for tripe in allTripes2: if tripe[2] == item2COO.lemma: tripe[2] = item2.lemma elif tripe[0] == item2COO.lemma: tripe[0] = item2.lemma return allTripes2 def dsfn5and6COO(self,sentence,item1,item2): if item1.dependency == "COO": item1COO = item1.head_word if item2.dependency == "COO": item2COO = item2.head_word allTripe = self.dsfn1_2_3_4COO(sentence,item1COO,item2COO) for tripe in allTripe: if tripe[0] == item1COO.lemma and tripe[2] == item2COO.lemma: tripe[0] = item1.lemma tripe[2] = item2.lemma if tripe[2] == item1COO.lemma and tripe[0] == item2COO.lemma: tripe[2] = item1.lemma tripe[0] = item2.lemma return allTripe def dsfnStart(self, rawSentence, entity1, entity2,all_entity): nounRelatedWithPosition = ['主席','总理','教授','校长'] resultList = [] lemmas = dsfn.segment(rawSentence) words = dsfn.postag(lemmas) words_netag = dsfn.netag(words) sentence = dsfn.parse(words_netag) # print(sentence.to_string()) Rawitem1 = None Rawitem2 = None item1 = None item2 = None Rawitem1Index = -1 Rawitem2Index = -1 indexList = [-1,-1] for item in sentence.words: if (item.lemma == entity1): Rawitem1 = item if (item.lemma == entity2): Rawitem2 = item if Rawitem1 != None and Rawitem2 != None and (Rawitem1.ID!=Rawitem1Index or Rawitem2.ID!=Rawitem2Index): Rawitem1Index = Rawitem1.ID Rawitem2Index = Rawitem2.ID # if item1 == None or item2 == None: # return None item1 = Rawitem1 item2 = Rawitem2 if item1.ID > item2.ID: c = item1 item1 = item2 item2 = c # print(str(item1.ID) + " " + str(item2.ID)) itemCopy1 = item1 itemCopy2 = item2 # print(item1.lemma) # print(item2.lemma) # print(self.dsfnConstraints2(sentence,item1,item2,all_entity)) if self.dsfnConstraints2(sentence,item1,item2,all_entity) == False: continue allTripes = self.dsfnStartCOO2(sentence,item1,item2) # print("111"+item2.lemma) if allTripes!=None and len(allTripes) == 0: while item1.dependency == "ATT": item1 = item1.head_word while item2.dependency == "ATT": item2 = item2.head_word allTripes = self.dsfnStartCOO2(sentence, item1, item2) if len(allTripes) != 0: for tripe in allTripes: if tripe[1]!= "": if tripe[0] == item1.lemma: if item1.ID < itemCopy1.ID: tripe[0] = item1.lemma+""+itemCopy1.lemma elif item1.ID > itemCopy1.ID: tripe[0] = itemCopy1.lemma+""+item1.lemma else: tripe[0] = itemCopy1.lemma elif tripe[2] == item1.lemma: if item1.ID < itemCopy1.ID: tripe[2] = item1.lemma+""+itemCopy1.lemma elif item1.ID > itemCopy1.ID: tripe[2] = itemCopy1.lemma+""+item1.lemma else: tripe[2] = itemCopy1.lemma # tripe[2] = itemCopy1.lemma if tripe[0] == item2.lemma: if item2.ID < itemCopy2.ID: tripe[0] = item2.lemma + ""+ itemCopy2.lemma elif item2.ID > itemCopy2.ID: tripe[0] = 
itemCopy2.lemma + ""+ item2.lemma else: tripe[0] = itemCopy2.lemma elif tripe[2] == item2.lemma: # print(item2.lemma) if item2.ID < itemCopy2.ID: tripe[2] = item2.lemma + ""+ itemCopy2.lemma elif item2.ID > itemCopy2.ID: tripe[2] = itemCopy2.lemma + ""+ item2.lemma else: tripe[2] = itemCopy2.lemma # print("12345") resultList.append(tripe) else: for tripe in allTripes: if tripe[1]!="": resultList.append(tripe) # if len(resultList) > 0: # return np.array(set([tuple(t) for t in resultList])) if item1 == None or item2 == None: return None if len(resultList) > 0: # return np.array(set([tuple(t) for t in resultList])) # print("输出结果1"+str(resultList)) return resultList def dsfnStartCOO2(self, sentence, item1, item2): nounRelatedWithPosition = ['主席', '总理', '教授', '校长'] resultList = [] itemCopy1 = item1 itemCopy2 = item2 """ 来解决ATT依赖的名词,如 李克强[ATT] <----- 总理[SBV] """ # print(item1.lemma) # print(item2.lemma) allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2) if len(allTripes) == 0: # print("11111111") allTripes = self.dsfn5COO(sentence, item1, item2) if allTripes == None or len(allTripes) == 0: # print("2222222") allTripes = self.dsfn6COO(sentence, item1, item2) if allTripes == None or len(allTripes) == 0: # print("3333333") allTripes = self.dsfn5and6COO(sentence, item1, item2) # if allTripes == None or len(allTripes) == 0: # print("44444444444") # allTripes = self.dsfnAttCOO(sentence,item1,item2) # print("第一次"+str(allTripes)) if allTripes != None and len(allTripes) != 0: for tripe in allTripes: resultList.append(tripe) # print("第二次") pred1 = None subForCoo = None for item in sentence.words: if item.postag == "v" and item.dependency == "COO": pred1 = item.head_word for word in sentence.words: if word.dependency == "SBV" and word.head_word.ID == pred1.ID: for phrase in sentence.words: if phrase.dependency == "SBV" and phrase.head_word.ID == item.ID: subForCoo = phrase if subForCoo == None or ( subForCoo != None and subForCoo.ID == word.ID): # 处理动词COO的情况,必须要保证此并列动词没有额外主语。 # 考虑到:习近平主席视察厦门,李克强总理访问香港 word.head_word = item allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2) if len(allTripes) == 0: # print("11111111") allTripes = self.dsfn5COO(sentence, item1, item2) if allTripes == None or len(allTripes) == 0: # print("2222222") allTripes = self.dsfn6COO(sentence, item1, item2) if allTripes == None or len(allTripes) == 0: # print("3333333") allTripes = self.dsfn5and6COO(sentence, item1, item2) # if allTripes == None or len(allTripes) == 0: # allTripes = self.dsfnAttCOO(sentence,item1,item2) # print("第二次"+str(allTripes)) if allTripes != None and len(allTripes) != 0: for tripe in allTripes: resultList.append(tripe) # print(np.array(set([tuple(t) for t in resultList]))) return resultList def dsfnConstraints1(self,rawSentence,maxLength): """ :param rawSentence: 原句子 :param maxLength: 句子的最大长度 :return: 小于maxLength的长度 """ newSentence = [] if len(rawSentence) <= maxLength: newSentence.append(rawSentence) return newSentence else: newSentence = self.splitSentenceByComma(rawSentence) return newSentence def dsfnConstraints2(self,sentence,item1,item2,allEntities): countEntity = 0 countChar = 0 for index in range(item1.ID+1, item2.ID): word = sentence.get_word_by_id(index) countChar += len(word.lemma) if word.lemma in allEntities: countEntity +=1 # print(countEntity) # print(countChar) if countEntity > 3: return False elif countChar > 12: return False else: return True def dsfnConstraints3(self,sentence,item1,item2): countChar = 0 for index in range(item1.ID+1, item2.ID): word = sentence.get_word_by_id(index) 
countChar += len(word.lemma) if countChar > 5: return False else: return True def getSPO(self,sentence): all_result = [] raw_sentence = [] RawSentence = sentence lemmas = self.segment(sentence) words = self.postag(lemmas) words_netag = self.netag(words) sentence = self.parse(words_netag) print(sentence.to_string()) for itemWord in sentence.words: #来找到一个动词,这个动词要么是一句话的HED,要么与一句话的HED是COO的依存关系 if (itemWord.head_word == None and itemWord.postag == "v" ) or (itemWord.postag == "v" and itemWord.dependency == "COO" and itemWord.head_word.head_word == None)\ or (itemWord.postag == "v"): relation_verb = itemWord #将找到的这个动词,作为relation_verb relationString = relation_verb.lemma if itemWord.head_word==None: verbId = itemWord.ID #关系动词的ID verbId2 = None elif itemWord.head_word.head_word == None: verbId = itemWord.ID #该关系动词的ID verbId2 = itemWord.head_word.ID #这句话的HED,用来找SUB else: verbId = itemWord.ID #该关系动词的ID verbId2 = None O_dict = dict() #存储所有的Object S_dict = dict() #存储所有的Subject verb_dict = dict() #存储所有的verb,主要考虑的情况为:习近平主席在北京大学发表演讲 OBJ = None SUB = None for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verbId: #寻找这个动词的主语 # if SUB == None or SUB.lemma != entity: SUB = item #找到主语 S_dict[SUB.ID] = SUB.lemma #将主语加入到字典中 if (item.dependency == "VOB" and item.head_word.ID == verbId): # 找到这个动词的宾语,其中包括:直接宾语,介词宾语(该宾语依赖POB---->介词(词性为p)--ADV or CMP-->动词) OBJ = item O_dict[OBJ.ID] = OBJ.lemma relationString = relation_verb.lemma verb_dict[OBJ.ID] = relationString if (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP" and item.head_word.head_word.ID== verbId) : # 找到这个动词的宾语,其中包括:直接宾语,介词宾语(该宾语依赖POB---->介词(词性为p)--ADV or CMP-->动词) OBJ = item O_dict[OBJ.ID] = OBJ.lemma relationString = relation_verb.lemma + "" + item.head_word.lemma verb_dict[OBJ.ID] = relationString if (item.dependency == "POB" and item.head_word.postag == "p"\ and item.head_word.dependency == "ADV" and item.head_word.head_word.ID== verbId): # 找到这个动词的宾语,其中包括:直接宾语,介词宾语(该宾语依赖POB---->介词(词性为p)--ADV or CMP-->动词) OBJ = item O_dict[OBJ.ID] = OBJ.lemma relationString = relation_verb.lemma for eachWord in sentence.words: if eachWord.dependency == "VOB" and eachWord.head_word.ID == relation_verb.ID: relationString = relation_verb.lemma + "" + eachWord.lemma verb_dict[OBJ.ID] = relationString if SUB == None:#如果没找到主语,那么就找与该动词并列的verbId2的主语 for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verbId2: # if SUB == None or SUB.lemma != entity: SUB = item S_dict[SUB.ID] = SUB.lemma if OBJ == None: verb_coo = None for item in sentence.words: if item.dependency == "COO" and item.head_word.ID == verbId and item.ID > verbId: verb_coo = item break flag = True if verb_coo != None and self.get_entity_num_between(relation_verb,verb_coo,sentence) == 0: for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verb_coo.ID: flag = False if flag!= False: for item in sentence.words: if (item.dependency == "VOB" and item.head_word.ID == verb_coo.ID)\ or (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP" and item.head_word.head_word.ID== verb_coo.ID) or (item.dependency == "POB" and item.head_word.postag == "p"\ and item.head_word.dependency == "ADV" and item.head_word.head_word.ID== verb_coo.ID): OBJ = item O_dict[OBJ.ID] = OBJ.lemma print(verb_dict) print(O_dict) SUB_COO = None OBJ_COO = None for item in sentence.words: if item.head_word != None: if SUB != None and item.dependency == "COO" and item.head_word.ID 
in S_dict: #获得主语的COO SUB_COO = item S_dict[SUB_COO.ID] = SUB_COO.lemma if item.head_word != None and OBJ!=None: if item.dependency == "COO" and item.head_word.ID in O_dict: #获得宾语的COO OBJ_COO = item O_dict[OBJ_COO.ID] = OBJ_COO.lemma S_new = [] for sub in S_dict: if sentence.get_word_by_id(sub).postag == 'r': continue S_dict2 = dict() # 存放主语ATT的列表 S_dict2[sub] = S_dict[sub] flag = True while flag == True: len1 = len(S_dict2) for item in sentence.words: if item.head_word != None: SUBList = S_dict2.keys() if item.head_word.ID in SUBList and (item.dependency == "ATT" or item.dependency == "ADV"): SUBATT = item S_dict2[SUBATT.ID] = SUBATT.lemma if len(S_dict2) != len1 : flag = True else: flag = False S_dict2 = sorted(S_dict2.items(), key=lambda item: item[0]) Subject = "" for i in S_dict2: Subject += i[1] S_new.append(Subject) O_new = [] V_new = [] for obj in O_dict: if sentence.get_word_by_id(obj).postag == 'r': continue O_dict2 = dict() # 存放宾语ATT的列表 O_dict2[obj] = O_dict[obj] if verb_dict!=None: if obj in verb_dict: relationString2 = verb_dict[obj] else: relationString2 = relation_verb.lemma else: relationString2 = relation_verb.lemma V_new.append(relationString2) flag = True while flag == True: len2 = len(O_dict2) for item in sentence.words: if item.head_word != None: OBJList = O_dict2.keys() if item.head_word.ID in OBJList and (item.dependency == "ADV" or item.dependency == "ATT" or item.dependency == "VOB"): OBJATT = item O_dict2[OBJATT.ID] = OBJATT.lemma if len(O_dict2) != len2: flag = True else: flag = False #一直循环,直到找不到新的修饰词 O_dict2 = sorted(O_dict2.items(), key=lambda item: item[0]) Object = "" for i in O_dict2: Object += i[1] O_new.append(Object) print(O_dict) print(O_new) for sub in S_new: for i in range(0,len(O_new)): obj = O_new[i] relationWord = V_new[i] if obj != "": # print(RawSentence) # print((sub, relationWord, obj)) all_result.append([sub,relationWord,obj]) raw_sentence.append(RawSentence) return all_result,raw_sentence def hasEntity(self,word,allEntity): for entity in allEntity: if entity in word: # print(entity) return True return False def PostProcessSPO(self,rawSentence,allTripes,allEntity): output_list = [] for i in range(0,len(allTripes)): tripe = allTripes[i] sub = tripe[0] obj = tripe[2] # print(sub) # print(obj) if self.hasEntity(sub,allEntity) and self.hasEntity(obj,allEntity): output_list.append(tripe) return output_list
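The getSPO logic above first picks a relation verb (the sentence HED, or a verb in COO with it), then collects its SBV subject and its VOB or prepositional POB objects before expanding them with COO/ATT modifiers. As a rough, self-contained illustration of just the SBV/VOB core of that idea -- not the project's full algorithm, and with a ./ltp_data/ model directory assumed -- a minimal sketch on raw pyltp output could look like this:

# Hedged sketch only: pairs each verb's SBV and VOB dependents into (S, V, O).
import os
from pyltp import Segmentor, Postagger, Parser

MODELDIR = "./ltp_data/"  # assumed model directory

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))


def naive_spo(sentence):
    """Return (subject, verb, object) triples by pairing each verb's SBV and VOB dependents."""
    words = list(segmentor.segment(sentence))
    postags = list(postagger.postag(words))
    arcs = parser.parse(words, postags)
    # arc.head is 1-based (0 is the virtual root), so word i's head is words[arc.head - 1]
    triples = []
    for v, tag in enumerate(postags):
        if tag != 'v':
            continue
        subs = [words[i] for i, arc in enumerate(arcs) if arc.head == v + 1 and arc.relation == 'SBV']
        objs = [words[i] for i, arc in enumerate(arcs) if arc.head == v + 1 and arc.relation == 'VOB']
        for s in subs:
            for o in objs:
                triples.append((s, words[v], o))
    return triples


print(naive_spo('习近平主席在北京大学发表演讲'))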
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import Parser

import name_convert as nc
import title_to_tree as tot
import predict as pt

# Load the segmenter and POS tagger with external user lexicons.
segmentor = Segmentor()
segmentor.load_with_lexicon("./system/ltp_data/cws.model", "./system/ltp_data/plain.txt")
postagger = Postagger()
postagger.load_with_lexicon("./system/ltp_data/pos.model", "./system/ltp_data/postagger.txt")
parser = Parser()
parser.load("./system/ltp_data/parser.model")


def get_result(company_name, news_titles):
    title_tree = []
    for sentence in news_titles:
        words = segmentor.segment(sentence)
        words = nc.convert(words)
        # If the company name does not appear in the title, prepend it so the
        # parse always contains the entity of interest.
        if company_name not in words:
            add_word = [company_name, u':'.encode('utf8')]
            add_word.extend(words)
            words = add_word
        # print("\t".join(words))
        postags = postagger.postag(words)
        # print("\t".join(postags))
        arcs = parser.parse(words, postags)
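The snippet above loads both the segmenter and the tagger with external lexicons (plain.txt and postagger.txt). As a hedged sketch only -- the entries below are invented examples, ./system/ltp_data/ is assumed to exist, and the exact dictionary format should be checked against the pyltp documentation for your version -- such files could be produced like this:

# Assumption: pyltp external dictionaries are plain UTF-8 text, one entry per
# line; the POS lexicon lists a term followed by its candidate tags, separated
# by spaces. Verify against the pyltp docs before relying on this.
import io

seg_entries = [u"阿里巴巴", u"中石化", u"国家电网"]        # extra terms for the segmenter
pos_entries = [u"阿里巴巴 ni", u"中石化 ni", u"雷军 nh"]   # term + candidate POS tags

with io.open("./system/ltp_data/plain.txt", "w", encoding="utf-8") as f:
    f.write(u"\n".join(seg_entries) + u"\n")
with io.open("./system/ltp_data/postagger.txt", "w", encoding="utf-8") as f:
    f.write(u"\n".join(pos_entries) + u"\n")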
class Extractor():
    def __init__(self):
        self.__clause_list = []
        self.__subclause_dict = {}
        self.__triple_list = []
        self.__segmentor = Segmentor()
        self.__postagger = Postagger()
        self.__recognizer = NamedEntityRecognizer()
        self.__parser = Parser()
        self.__labeller = SementicRoleLabeller()
        self.__words_full_list = []
        self.__netags_full_list = []

    @property
    def clause_list(self):
        return self.__clause_list

    @property
    def triple_list(self):
        return self.__triple_list

    def split(self, words, postags):
        # Cut the word sequence into clauses at commas and full stops.
        start = 0
        for j, w in enumerate(words):
            if w == ',' or w == ',' or w == '。':
                clause = Clause(start, j - 1)
                self.__clause_list.append(clause)
                start = j + 1
        for clause in self.__clause_list:
            clause.split(postags)
            for subclause in clause.sub_clause_list:
                self.add_inverted_idx(subclause)

    def add_inverted_idx(self, subclause):
        for i in range(subclause.start_idx, subclause.end_idx):
            self.__subclause_dict[i] = subclause

    def load(self):
        self.__segmentor.load('ltp_data/cws.model')
        self.__postagger.load('ltp_data/pos.model')
        self.__recognizer.load('ltp_data/ner.model')
        self.__parser.load('ltp_data/parser.model')
        self.__labeller.load('ltp_data/srl')

    def release(self):
        self.__segmentor.release()
        self.__postagger.release()
        self.__recognizer.release()
        self.__parser.release()
        self.__labeller.release()

    def clear(self):
        self.__triple_list = []
        self.__words_full_list = []
        self.__netags_full_list = []

    def resolve_conference(self, entity):
        # Simple pronoun resolution: map 他/她 back to the nearest preceding
        # person-type named entity (netag ending in 'nh').
        try:
            e_str = entity.get_content_as_str()
        except Exception:
            return '?'
        ref = e_str
        if e_str == '他' or e_str == '她':
            for i in range(entity.loc, -1, -1):
                if self.__netags_full_list[i].lower().endswith('nh'):
                    ref = self.__words_full_list[i]
                    break
        return ref

    def resolve_all_conference(self):
        for t in self.triple_list:
            e_str = self.resolve_conference(t.entity_1)
            try:
                t.entity_1.content = e_str.split()
            except Exception:
                pass

    def sentenceSplite(self, text):
        return SentenceSplitter.split(text)

    def chunk_str(self, data):
        sents = self.sentenceSplite(data)
        offset = 0
        for sent in sents:
            try:
                words = self.__segmentor.segment(sent)
                postags = self.__postagger.postag(words)
                netags = self.__recognizer.recognize(words, postags)
                arcs = self.__parser.parse(words, postags)
                roles = self.__labeller.label(words, postags, netags, arcs)
                self.chunk_sent(list(words), list(postags), list(arcs), offset)
                offset += len(list(words))
                self.__words_full_list.extend(list(words))
                self.__netags_full_list.extend(list(netags))
            except Exception as e:
                print(str(e))
                pass

    def chunk_sent(self, words, postags, arcs, offset):
        root = [i for i, x in enumerate(arcs) if x.relation == 'HED']
        if len(root) > 1:
            raise Exception('More than 1 HEAD arc is detected!')
        root = root[0]
        relations = [i for i, x in enumerate(arcs) if x.head == root and x.relation == 'COO']
        relations.insert(0, root)
        prev_e1 = None
        e1 = None
        for rel in relations:
            left_arc = [i for i, x in enumerate(arcs) if x.head == rel and x.relation == 'SBV']
            if len(left_arc) > 1:
                pass  # raise Exception('More than 1 left arc is detected!')
            elif len(left_arc) == 0:
                e1 = prev_e1
            elif len(left_arc) == 1:
                left_arc = left_arc[0]
                leftmost = find_farthest_att(arcs, left_arc)
                e1 = Entity(1, [words[i] for i in range(leftmost, left_arc + 1)], offset + leftmost)
            prev_e1 = e1
            right_arc = [i for i, x in enumerate(arcs) if x.head == rel and x.relation == 'VOB']
            e2_list = []
            if not right_arc:
                e2 = Entity(2, None)
                e2_list.append(e2)
            else:
                right_ext = find_farthest_vob(arcs, right_arc[0])
                items = [i for i, x in enumerate(arcs) if x.head == right_ext and x.relation == 'COO']
                items = right_arc + items
                count = 0
                for item in items:
                    leftmost = find_farthest_att(arcs, item)
                    e2 = None
                    if count == 0:
                        e2 = Entity(2, [words[i] for i in range(leftmost, right_ext + 1)], offset + leftmost)
                    else:
                        p1 = range(leftmost, right_arc[0])
                        p2 = range(item, find_farthest_vob(arcs, item) + 1)
                        e2 = Entity(2, [words[i] for i in itertools.chain(p1, p2)])
                    e2_list.append(e2)
                    r = Relation(words[rel])
                    t = Triple(e1, e2, r)
                    self.__triple_list.append(t)
                    count += 1

    def segment(self, sentence):
        words = self.__segmentor.segment(sentence)
        return words

    def postag(self, words):
        postags = self.__postagger.postag(words)
        return postags

    def parse(self, words, postags):
        arcs = self.__parser.parse(words, postags)
        return arcs

    def recognize(self, words, postags):
        netags = self.__recognizer.recognize(words, postags)
        return netags

    def label(self, words, postags, netags, arcs):
        roles = self.__labeller.label(words, postags, netags, arcs)
        return roles
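For orientation, a minimal driver for the Extractor class above might look like the following. It assumes the project-defined helpers (Clause, Entity, Relation, Triple, find_farthest_att, find_farthest_vob) are importable and that the ltp_data/ models referenced in load() are present; this is a usage sketch, not code from the original project.

# Hedged usage sketch for Extractor; output format depends on how the
# project implements Triple/Entity/Relation printing.
extractor = Extractor()
extractor.load()                       # loads cws/pos/ner/parser/srl models from ltp_data/
extractor.chunk_str('李克强总理今天视察了上海,他发表了讲话。')
extractor.resolve_all_conference()     # map 他/她 back to the nearest person entity
for triple in extractor.triple_list:
    print(triple)
extractor.release()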
        format = 'json'
        pattern = 'all'
        result = urllib2.urlopen("%sapi_key=%s&text=%s&format=%s&pattern=%s"
                                 % (url_get_base, api_key, text, format, pattern))
        content = result.read().strip()
        # print content
        return json.loads(content)[0]
    else:
        aa = []
        return aa


segmentor = Segmentor()
segmentor.load_with_lexicon(os.path.join(MODELDIR, "cws.model"), "/data0/dm/dict/dict.txt")
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))


# Analyse one sentence with the local LTP models.
def callLTP(sentence):
    words = segmentor.segment(sentence)
    postags = postagger.postag(words)
    arcs = parser.parse(words, postags)
    resultJson = []
    for index in range(len(words)):
        resultJson.append({'id': index,
                           'cont': words[index],
                           'pos': postags[index],
                           'relate': arcs[index].relation,
                           'parent': arcs[index].head - 1})
    return resultJson


# Analyse each line by calling callLTP.
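To make the returned structure concrete, here is a hedged example of driving callLTP on one sentence. The actual tag and relation values depend entirely on the loaded models, so only the field layout described in the comments should be relied on.

# Illustrative only: real 'pos'/'relate'/'parent' values vary with the models.
if __name__ == '__main__':
    for token in callLTP('习近平主席在北京大学发表演讲'):
        # Each entry mirrors the LTP cloud JSON layout:
        #   id     -- 0-based token index
        #   cont   -- the token text
        #   pos    -- its part-of-speech tag
        #   relate -- dependency relation to its head
        #   parent -- 0-based index of the head token (-1 for the root)
        print(token['id'], token['cont'], token['pos'], token['relate'], token['parent'])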
class DSFN: """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析 Attributes: default_user_dict_dir:str,用户自定义词典目录 default_model_dir:str,ltp模型文件目录 """ entity_verb_new = entity_verb_new() all_entity = entity_verb_new.readAllEntity( "../../entity_verb//entity_verb_result\\all_entity.json") default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\' # LTP模型文件目录 location_entity = [ "中和殿", "太庙", "人文地理", "亚运村", "九龙壁", "圆明园", "古典建筑", "庑殿顶", "天井", "无量殿", "慈宁宫", "三希堂", "居庸关", "延寿寺", "排云殿", "东桥", "圜丘", "南天门", "垂花门", "西六宫", "配楼", "柳荫街", "中国四大名园", "午门", "乾东五所", "建筑管理", "世界博物馆", "西什库教堂", "晚清", "万泉河", "东暖阁", "储秀宫", "西华门", "院落", "地安门东大街", "御路", "知鱼桥", "清宁宫", "金水河", "景山前街", "司马台长城", "景山公园", "乐寿堂", "东六宫", "延陵", "宜芸馆", "芍药居", "承乾宫", "琉璃瓦", "湘江", "敌楼", "安定门外大街", "三音石", "崇文门", "天坛路", "台基", "东城区", "外朝", "武备", "全国重点文物保护单位", "房山石", "静园", "香山", "中东", "坤宁宫", "彩画", "江南园林", "北河沿大街", "岳阳楼", "丽景轩", "巴黎圣母院", "钟表馆", "戏楼", "白银", "红海", "中原", "明长城", "神乐署", "瀛洲", "码头", "百度地图", "旋子彩画", "乾西五所", "天圆地方", "琉璃厂文化街", "广岛", "御沟", "井亭", "古柏林", "石坊", "北京故宫", "宝云阁", "甬道", "熙和门", "乾清门", "北京城", "暖温带", "沥粉贴金", "安定路", "北齐长城", "减柱造", "宅园", "清华园", "天坛东门站", "西苑", "土山", "温带季风气候", "宫古", "东直门", "美国国务卿", "北海", "中华梦石城", "东门站", "天坛公园", "江山", "谐趣园", "修宅", "苏堤", "玉泉", "牌坊", "蓟镇", "高速公路", "钟粹宫", "无梁殿", "政治家", "牌楼", "波斯", "西内", "老龙头", "阴阳石", "三神山", "丹陛桥", "中国第一历史档案馆", "建筑艺术", "四川", "护城河", "文华殿", "静宜园", "乐峰", "永和宫", "金砖", "清漪园", "安定门", "宫殿", "梵华楼", "龙井", "水街", "东华门", "歇山式顶", "斋宫", "渤海镇", "仁和", "白浮村", "建筑风格", "买卖街", "藻鉴堂", "寿安宫", "奉先殿", "后海", "宋", "承德避暑山庄", "前门站", "寿安山", "八达岭", "棂星门", "经幢", "泰山", "后三宫", "天桥商场", "维新派", "拙政园", "北京十六景", "南湖岛", "山寨", "东海", "寺庙", "图书馆", "西山", "延禧宫", "九土", "十七孔桥", "鹊桥", "石鼓", "样式雷", "礼乐", "圆石", "动物园", "西湖", "齐长城遗址", "京畿", "正脊", "神武门", "洛神赋图", "绿地面积", "暖阁", "多宝塔", "磨砖对缝", "湖心亭", "崇楼", "五谷丰登", "养性殿", "关山", "砖雕", "北境", "凤凰墩", "金殿", "永定路", "世界遗产", "古柏", "郡王府", "慕田峪", "皇舆全览图", "古典园林", "坐北朝南", "皇极殿", "皇家园林", "东四十条", "京西", "黄花镇", "通惠河", "宁寿宫", "旅游局", "大角楼", "昆明湖", "后溪", "东堤", "汉白玉石", "皇史宬", "湖心岛", "长春宫", "玉澜堂", "紫檀", "玉泉山", "玉山", "茶楼", "敌台", "乾清宫", "巴县", "藕香榭", "斗拱", "苏州街", "紫禁城", "颐和轩", "皇穹宇", "南方", "智慧海", "八小部洲", "拱券", "门楣", "太和殿", "銮仪卫", "法门寺地宫", "清音阁", "龙王庙", "城岛", "皇陵", "筒瓦", "天地坛", "张古", "建筑史", "武英殿", "北长街", "天坛", "云山", "大石桥", "北平", "宫殿建筑", "山东", "博物馆", "昆明池", "交道口南大街", "平流村", "聊城", "三大殿", "清晏舫", "墀头", "养心殿", "御道", "百花园", "翊坤宫", "神道", "落地罩", "渔村", "丹陛", "歇山顶", "畅音阁", "漱芳斋", "黄鹤楼", "柱础", "嘉乐堂", "庆长", "档案", "保定", "上海", "佛香阁", "望柱", "德和园", "天桥", "北京旅游网", "祈年殿", "颐和园", "攒尖顶", "香岩宗印之阁", "分界线", "大杂院", "交泰殿", "太和门", "南郊", "健翔桥", "瓮山", "勤政殿", "云南", "景仁宫", "小山村", "金水桥", "保和殿", "寄畅园", "珍妃井", "德和园大戏楼", "正房", "第一批全国重点文物保护单位", "三合院", "万寿山", "厉家菜", "玉峰塔", "藻井", "恭王府花园", "文昌阁", "景山", "前门东大街", "端门", "代王府", "万寿亭", "景阳宫", "东四环", "景明楼", "祈谷坛", "大戏楼", "安佑宫", "石舫", "流杯亭", "行宫", "法华寺", "圜丘坛", "正义路", "居庸关长城", "箭扣长城", "石牌坊", "回音壁", "和玺彩画", "二龙戏珠", "北四环", "玉龙", "广州", "盛京", "四合院", "曲尺", "谷仓", "永定门", "宝顶", "苏式彩画", "皇宫", "寿康宫" ] def __init__(self, model_dir=default_model_dir, all_entity=all_entity): self.default_model_dir = model_dir # 加载ltp模型 # default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\' # LTP模型文件目录 self.segmentor_user = Segmentor() user_dict = "..\\source\\user.txt" segmentor_flag_user = self.segmentor_user.load_with_lexicon( os.path.join(default_model_dir, 'cws.model'), user_dict) self.segmentor = Segmentor() segmentor_flag = self.segmentor.load( os.path.join(default_model_dir, 'cws.model')) # 词性标注模型 self.postagger = Postagger() postag_flag = self.postagger.load( os.path.join(self.default_model_dir, 
'pos.model')) # 命名实体识别模型 self.recognizer = NamedEntityRecognizer() ner_flag = self.recognizer.load( os.path.join(self.default_model_dir, 'ner.model')) # 依存句法分析模型 self.parser = Parser() parser_flag = self.parser.load( os.path.join(self.default_model_dir, 'parser.model')) if segmentor_flag or postag_flag or ner_flag or parser_flag or segmentor_flag_user: # 可能有错误 print('load model failed') def segment(self, sentence, segmentor, entity_postag=dict()): words = segmentor.segment(sentence) lemmas = [] for lemma in words: lemmas.append(lemma) return lemmas def getPostag(self): return self.postagger def postag(self, lemmas): """ Parameters ---------- lemmas : List,分词后的结果 entity_dict:Set,实体词典,处理具体的一则判决书的结构化文本时产生 Returns ------- words:WordUnit List,包括分词与词性标注的结果 """ words = [] # 词性标注 postags = self.postagger.postag(lemmas) for i in range(len(lemmas)): # 存储分词与词性标记后的词单元WordUnit,编号从1开始 word = WordUnit(i + 1, lemmas[i], postags[i]) words.append(word) # self.postagger.release() #释放 return words def get_postag(self, word): """获得单个词的词性标注 Args: word:str,单词 Returns: pos_tag:str,该单词的词性标注 """ pos_tag = self.postagger.postag([word]) return pos_tag[0] def netag(self, words): """ 命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并 Parameters words : WordUnit list,包括分词与词性标注结果 Returns words_netag:WordUnit list,包含分词,词性标注与命名实体识别的结果 """ lemmas = [] # 存储分词后的结果 postags = [] # 存储词性标注结果 for word in words: lemmas.append(word.lemma) postags.append(word.postag) # 命名实体识别 netags = self.recognizer.recognize(lemmas, postags) words_netag = EntityCombine().combine(words, netags) return words_netag def parse(self, words): """ 对分词,词性标注与命名实体识别后的结果进行依存句法分析(命名实体识别可选) Args: words_netag:WordUnit list,包含分词,词性标注与命名实体识别结果 Returns *:sentenceUnit 句子单元 """ lemmas = [] # 分词结果 postags = [] # 词性标注结果 for word in words: lemmas.append(word.lemma) postags.append(word.postag) # 依存句法分析 arcs = self.parser.parse(lemmas, postags) for i in range(len(arcs)): words[i].head = arcs[i].head words[i].dependency = arcs[i].relation return SentenceUnit(words) def close(self): """ 关闭与释放 """ # pynlpir.close() self.postagger.release() self.recognizer.release() self.parser.release() def splitSentence(self, text): pattern = r'。|!|?|;|=' result_list = re.split(pattern, text) result_list = list(filter(self.not_empty, result_list)) # print(result_list) return result_list def splitSentenceByComma(self, text): pattern = r',' result_list = re.split(pattern, text) result_list = list(filter(self.not_empty, result_list)) final_list = [] for sentence in result_list: if len(sentence) <= 40: final_list.append(sentence) return final_list def not_empty(self, s): return s and "".join(s.split()) def dsfn1_2_3_4COO(self, sentence, item1, item2, flagCOOATT): allTripes = [] """ 判断两个实体是否属于DSFN1的情况,并输出三元组 """ # print(item1.lemma) # print(item2.lemma) # print(flagCOOATT) if flagCOOATT == False: location_position_list = getAttWord() # print(location_position_list) if self.dsfnConstraints3(sentence, item1, item2) and (item1.dependency == "ATT"): AttWord = item1.head_word AttWordDict = dict() AttWordStr = "" while AttWord.ID < item2.ID: AttWordDict[AttWord.ID] = AttWord.lemma # print(AttWord.lemma) # AttWordStr += AttWord.lemma if (AttWord.dependency == "ATT"): AttWord = AttWord.head_word else: break if (AttWord.ID == item2.ID): flag = True while flag: len1 = len(AttWordDict) AttList = AttWordDict.keys() for id in range(item1.ID + 1, item2.ID): item = sentence.get_word_by_id(id) if item.head_word != None and item.head_word.ID in AttList and ( item.dependency == "ATT"): AttWordDict[item.ID] = item.lemma # 
print(item.lemma) if len1 == len(AttWordDict): flag = False else: flag = True AttWordDict = sorted(AttWordDict.items(), key=lambda item: item[0]) AttWordStr = "" for i in AttWordDict: AttWordStr += i[1] # print("三元组:(" + item1.lemma + "," + AttWordStr + "," + item2.lemma + ")") if AttWordStr in location_position_list: allTripes.append( [item1.lemma, AttWordStr, item2.lemma]) # print(allTripes) # print("-------------------------") # else: # for attWord in location_position_list: # if attWord in AttWordStr: # allTripes.append([item1.lemma, AttWordStr, item2.lemma]) # print(allTripes) # # print("-------------------------") """ 考虑DSFN2的情况 """ if item1.dependency == "SBV" and item1.head_word.postag == "v": pred1 = item1.head_word predDict = dict() predDict[pred1.ID] = pred1.lemma if item2.dependency == "VOB" and item2.head_word.postag == "v": pred2 = item2.head_word predDict[pred2.ID] = pred2.lemma if (len(predDict) == 1): PredWordStr = "" for i in predDict: PredWordStr += predDict[i] # print("DSFN2三元组:(" + item1.lemma + "," + PredWordStr + "," + item2.lemma + ")") allTripes.append([item1.lemma, PredWordStr, item2.lemma]) """ 新加,为了考虑“习近平视察和访问上海”的情况 """ if len(predDict) == 2: num = self.get_entity_num_between(pred1, pred2, sentence) flagSBV = True flagVOB = True for word in sentence.words: if word.dependency == "SBV" and word.head_word.ID == pred2.ID: flagSBV = False if (word.dependency == "VOB" and word.head_word.ID == pred1.ID) or (word.dependency == "POB" \ and word.head_word.dependency == "ADV" and word.head_word.head_word.ID == pred1.ID): flagVOB = False flagCMP = True if pred1 != None and pred1.dependency == "CMP" and pred1.head_word.ID == pred2.ID: flagCMP = False if pred2 != None and pred2.dependency == "CMP" and pred2.head_word.ID == pred1.ID: flagCMP = False flagCOO = True if pred1 != None and pred1.dependency == "COO" and pred1.head_word.ID == pred2.ID: flagCOO = False if pred2 != None and pred2.dependency == "COO" and pred2.head_word.ID == pred1.ID: flagCOO = False # print("pred1:"+pred1.lemma+",pred2:"+pred2.lemma+",num:"+str(num)) if num == 0: if flagCMP == False: if flagVOB == True and flagSBV == True: allTripes.append([ item1.lemma, pred1.lemma + "" + pred2.lemma, item2.lemma ]) if flagCOO == False: if flagVOB == True and flagSBV == True: allTripes.append([ item1.lemma, pred1.lemma + "" + pred2.lemma, item2.lemma ]) else: if flagVOB == True: allTripes.append( [item1.lemma, pred1.lemma, item2.lemma]) if flagSBV == True: allTripes.append( [item1.lemma, pred2.lemma, item2.lemma]) """ DSFN3.0 """ pred = None if item1.dependency == "SBV" and item1.head_word.postag == "v" and item2.dependency == "POB": pred = item1.head_word prep = item2.head_word elif item1.dependency == "FOB" and item2.dependency == "POB": # 考虑介词为“被”的情况,如 “小王被小明所陷害” pred = item1.head_word prep = item2.head_word c = item1 item1 = item2 item2 = c if pred != None and prep != None: if prep.dependency == "ADV": if prep.head_word.ID == pred.ID: pred2 = None object = None objectForPred2 = None for i in range(pred.ID + 1, len(sentence.words) + 1): item = sentence.get_word_by_id(i) if item.dependency == "VOB" and item.head_word.ID == pred.ID: object = item objectDict = dict() objectDict[object.ID] = object for word in sentence.words: if word.head_word != None and word.dependency == "ATT" and word.head_word.ID in objectDict: objectDict[word.ID] = word objectDict = sorted(objectDict.items(), key=lambda item: item[0]) objectStr = "" for objectItem in objectDict: objectStr += objectItem[1].lemma allTripes.append([ item1.lemma, 
pred.lemma + "" + objectStr, item2.lemma ]) if object == None: hasPOB = False for i in range(pred.ID + 1, len(sentence.words) + 1): item = sentence.get_word_by_id(i) if item.dependency == "POB" and item.head_word.dependency == "CMP" and item.head_word.head_word.ID == pred.ID: hasPOB = True allTripes.append([ item1.lemma, pred.lemma + "" + item.head_word.lemma + "" + item.lemma, item2.lemma ]) # print("DSFN3三元组:(" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")") if hasPOB == False: allTripes.append( [item1.lemma, pred.lemma, item2.lemma]) """ DSFN4 """ pred = None prep = None prep1 = None pred2 = None if item1.dependency == "SBV" and item2.dependency == "POB": pred = item1.head_word prep = item2.head_word if prep.dependency == "CMP": pred2 = prep.head_word if pred2.ID == pred.ID: # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep.lemma + "," + item2.lemma + ")") allTripes.append([ item1.lemma, pred.lemma + "" + prep.lemma, item2.lemma ]) else: num = self.get_entity_num_between(pred, pred2, sentence) flagSBV = True flagVOB = True for word in sentence.words: if word.dependency == "SBV" and word.head_word.ID == pred2.ID: flagSBV = False if (word.dependency == "VOB" and word.head_word.ID == pred.ID) or (word.dependency == "POB" \ and word.head_word.dependency == "ADV" and word.head_word.head_word.ID == pred.ID): flagVOB = False # print("pred1:"+pred1.lemma+",pred2:"+pred2.lemma+",num:"+str(num)) if num == 0: flag = True for word in sentence.words: if word.dependency == "CMP" and word.head_word.ID == pred.ID: prep1 = word if prep1 != None: if flagVOB == True: # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep1.lemma + "," + item2.lemma + ")") allTripes.append([ item1.lemma, pred.lemma + "" + prep1.lemma, item2.lemma ]) # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")") if flagSBV == True: allTripes.append([ item1.lemma, pred2.lemma + "" + prep.lemma, item2.lemma ]) else: if flagVOB == True: # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")") allTripes.append( [item1.lemma, pred.lemma, item2.lemma]) if flagSBV == True: # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")") allTripes.append([ item1.lemma, pred2.lemma + "" + prep.lemma, item2.lemma ]) """ DSFN5 """ # self.dsfn5and6(rawSentence,sentence,item1,item2) return allTripes def get_entity_num_between(self, verb1, verb2, sentence): """ 获得两个动词之间的实体数量 Parameters ---------- entity1 : WordUnit,动词1 entity2 : WordUnit,动词2 Returns: num:int,两动词间的实体数量 """ if verb1.ID > verb2.ID: c = verb1 verb1 = verb2 verb2 = c num = 0 i = verb1.ID while i < verb2.ID - 1: if self.is_entity(sentence.words[i]): num += 1 i += 1 return num def is_entity(self, entry): """判断词单元是否是实体 Args: entry:WordUnit,词单元 Returns: *:bool,实体(True),非实体(False) """ #候选实体词性列表 entity_postags = ['nh', 'ni', 'ns', 'nz', 'j', 'n', 'v', 'm'] # print(entry.lemma+" : "+entry.postag) if entry.postag in entity_postags: return True else: return False def dsfn5COO(self, sentence, item1, item2): if item1.dependency == "COO": item1COO = item1.head_word allTripes1 = self.dsfn1_2_3_4COO(sentence, item1COO, item2, True) # print(allTripes1) for tripe in allTripes1: if tripe[0] == item1COO.lemma: tripe[0] = item1.lemma elif tripe[2] == item1COO.lemma: tripe[2] = item1.lemma return allTripes1 # print("allTripes1"+str(allTripes1)) def dsfn6COO(self, sentence, item1, item2): if item2.dependency == "COO": item2COO = item2.head_word allTripes2 = 
self.dsfn1_2_3_4COO(sentence, item1, item2COO, True) for tripe in allTripes2: if tripe[2] == item2COO.lemma: tripe[2] = item2.lemma elif tripe[0] == item2COO.lemma: tripe[0] = item2.lemma return allTripes2 def dsfn5and6COO(self, sentence, item1, item2): if item1.dependency == "COO": item1COO = item1.head_word if item2.dependency == "COO": item2COO = item2.head_word allTripe = self.dsfn1_2_3_4COO(sentence, item1COO, item2COO, True) for tripe in allTripe: if tripe[0] == item1COO.lemma and tripe[ 2] == item2COO.lemma: tripe[0] = item1.lemma tripe[2] = item2.lemma if tripe[2] == item1COO.lemma and tripe[ 0] == item2COO.lemma: tripe[2] = item1.lemma tripe[0] = item2.lemma return allTripe def dsfnStart(self, rawSentence, segmentor, entity1, entity2, all_entity): nounRelatedWithPosition = ['主席', '总理', '教授', '校长'] resultList = [] lemmas = dsfn.segment(rawSentence, segmentor) words = dsfn.postag(lemmas) words_netag = dsfn.netag(words) sentence = dsfn.parse(words_netag) # print(sentence.to_string()) Rawitem1 = None Rawitem2 = None item1 = None item2 = None Rawitem1Index = -1 Rawitem2Index = -1 indexList = [-1, -1] for item in sentence.words: # print(str(item.ID) + " " +item.lemma ) if (item.lemma == entity1): Rawitem1 = item if (item.lemma == entity2): Rawitem2 = item if Rawitem1 != None and Rawitem2 != None and ( Rawitem1.ID != Rawitem1Index or Rawitem2.ID != Rawitem2Index): Rawitem1Index = Rawitem1.ID Rawitem2Index = Rawitem2.ID # print(str(Rawitem1Index) +" " +str(Rawitem2Index)) # if item1 == None or item2 == None: # return None item1 = Rawitem1 item2 = Rawitem2 if item1.ID > item2.ID: c = item1 item1 = item2 item2 = c # print(str(item1.ID) + " " + str(item2.ID)) itemCopy1 = item1 itemCopy2 = item2 # print(item1.lemma) # print(item2.lemma) # print(self.dsfnConstraints2(sentence,item1,item2,all_entity)) if self.dsfnConstraints2(sentence, item1, item2, all_entity) == False: continue allTripes = self.dsfnStartCOO2(sentence, item1, item2, False) if allTripes != None: for tripe in allTripes: if tripe[1] != "": resultList.append(tripe) if item1 == None or item2 == None: return None if len(resultList) > 0: return resultList def dsfnStartCOO2(self, sentence, item1, item2, flagCOOATT): nounRelatedWithPosition = ['主席', '总理', '教授', '校长'] resultList = [] itemCopy1 = item1 itemCopy2 = item2 """ 来解决ATT依赖的名词,如 李克强[ATT] <----- 总理[SBV] """ # print(item1.lemma) # print(item2.lemma) allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2, flagCOOATT) if len(allTripes) == 0: # print("11111111") allTripes = self.dsfn5COO(sentence, item1, item2) if allTripes == None or len(allTripes) == 0: # print("2222222") allTripes = self.dsfn6COO(sentence, item1, item2) if allTripes == None or len(allTripes) == 0: # print("3333333") allTripes = self.dsfn5and6COO(sentence, item1, item2) # if allTripes == None or len(allTripes) == 0: # print("44444444444") # allTripes = self.dsfnAttCOO(sentence,item1,item2) # print("第一次"+str(allTripes)) if allTripes != None and len(allTripes) != 0: for tripe in allTripes: resultList.append(tripe) # print("第二次") pred1 = None subForCoo = None for item in sentence.words: if item.postag == "v" and item.dependency == "COO": pred1 = item.head_word for word in sentence.words: if word.dependency == "SBV" and word.head_word.ID == pred1.ID: for phrase in sentence.words: if phrase.dependency == "SBV" and phrase.head_word.ID == item.ID: subForCoo = phrase if subForCoo == None or ( subForCoo != None and subForCoo.ID == word.ID): # 处理动词COO的情况,必须要保证此并列动词没有额外主语。 # 考虑到:习近平主席视察厦门,李克强总理访问香港 word.head_word = item # 
print(sentence.to_string()) # print(item1.lemma) # print(item2.lemma) allTripes = self.dsfn1_2_3_4COO( sentence, item1, item2, flagCOOATT) if len(allTripes) == 0: # print("11111111") allTripes = self.dsfn5COO( sentence, item1, item2) if allTripes == None or len(allTripes) == 0: # print("2222222") allTripes = self.dsfn6COO( sentence, item1, item2) if allTripes == None or len( allTripes) == 0: # print("3333333") allTripes = self.dsfn5and6COO( sentence, item1, item2) # if allTripes == None or len(allTripes) == 0: # allTripes = self.dsfnAttCOO(sentence,item1,item2) # print("第二次"+str(allTripes)) if allTripes != None and len(allTripes) != 0: for tripe in allTripes: resultList.append(tripe) # print(np.array(set([tuple(t) for t in resultList]))) return resultList def dsfnConstraints1(self, rawSentence, maxLength): """ :param rawSentence: 原句子 :param maxLength: 句子的最大长度 :return: 小于maxLength的长度 """ newSentence = [] if len(rawSentence) <= maxLength: newSentence.append(rawSentence) return newSentence else: newSentence = self.splitSentenceByComma(rawSentence) return newSentence def dsfnConstraints2(self, sentence, item1, item2, allEntities): countEntity = 0 countChar = 0 for index in range(item1.ID + 1, item2.ID): word = sentence.get_word_by_id(index) countChar += len(word.lemma) if word.lemma in allEntities: countEntity += 1 # print(countEntity) # print(countChar) if countEntity > 3: return False elif countChar > 12: # print(countChar) return False else: return True def dsfnConstraints3(self, sentence, item1, item2): countChar = 0 for index in range(item1.ID + 1, item2.ID): word = sentence.get_word_by_id(index) countChar += len(word.lemma) if countChar > 5: return False else: return True def getSPO(self, sentence, segmentor): all_result = [] raw_sentence = [] RawSentence = sentence lemmas = self.segment(sentence, segmentor) words = self.postag(lemmas) words_netag = self.netag(words) sentence = self.parse(words_netag) # print(sentence.to_string()) for itemWord in sentence.words: #来找到一个动词,这个动词要么是一句话的HED,要么与一句话的HED是COO的依存关系 if (itemWord.head_word == None and itemWord.postag == "v" ) or (itemWord.postag == "v" and itemWord.dependency == "COO" and itemWord.head_word.head_word == None)\ or (itemWord.postag == "v") : relation_verb = itemWord #将找到的这个动词,作为relation_verb relationString = relation_verb.lemma # print(relationString) if itemWord.head_word == None: # print("1") verbId = itemWord.ID # 关系动词的ID verbId2 = None elif itemWord.head_word.head_word == None: # print("2") verbId = itemWord.ID #该关系动词的ID if itemWord.dependency == "COO" or self.get_entity_num_between( itemWord, itemWord.head_word, sentence) == 0: verbId2 = itemWord.head_word.ID # 这句话的HED,用来找SUB else: verbId2 = None else: # print("3") verbId = itemWord.ID #该关系动词的ID if itemWord.dependency == "COO" or self.get_entity_num_between( itemWord, itemWord.head_word, sentence) == 0: verbId2 = itemWord.head_word.ID # 这句话的HED,用来找SUB else: verbId2 = None O_dict = dict() #存储所有的Object S_dict = dict() #存储所有的Subject verb_dict = dict() #存储所有的verb,主要考虑的情况为:习近平主席在北京大学发表演讲 OBJ = None SUB = None DSFN3 = dict() for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verbId: #寻找这个动词的主语 # if SUB == None or SUB.lemma != entity: SUB = item #找到主语 S_dict[SUB.ID] = SUB.lemma #将主语加入到字典中 if (item.dependency == "VOB" and item.head_word.ID == verbId and item.postag != "v"): # 找到这个动词的宾语,其中包括:直接宾语,介词宾语(该宾语依赖POB---->介词(词性为p)--ADV or CMP-->动词) OBJ = item O_dict[OBJ.ID] = OBJ.lemma relationString = relation_verb.lemma verb_dict[OBJ.ID] = relationString if 
(item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP" and item.head_word.head_word.ID == verbId): # 找到这个动词的宾语,其中包括:直接宾语,介词宾语(该宾语依赖POB---->介词(词性为p)--ADV or CMP-->动词) OBJ = item O_dict[OBJ.ID] = OBJ.lemma relationString = relation_verb.lemma + "" + item.head_word.lemma verb_dict[OBJ.ID] = relationString if (item.dependency == "POB" and (item.head_word.postag == "p" or item.head_word.postag == 'd')\ and item.head_word.dependency == "ADV" and item.head_word.head_word.ID == verbId \ and item.postag!='v'): # 找到这个动词的宾语,其中包括:直接宾语,介词宾语(该宾语依赖POB---->介词(词性为p)--ADV or CMP-->动词) OBJ = item O_dict[OBJ.ID] = OBJ.lemma verbObj = None DSFN3[OBJ.ID] = True objectDict = dict() relationString = relation_verb.lemma for eachWord in sentence.words: if eachWord.dependency == "VOB" and eachWord.head_word.ID == relation_verb.ID: # relationString = relation_verb.lemma + "" + eachWord.lemma verbObj = eachWord objectDict[verbObj.ID] = verbObj if verbObj != None: for word in sentence.words: if word.head_word != None and word.dependency == "ATT" and word.head_word.ID == verbObj.ID: objectDict[word.ID] = word objectDict = sorted(objectDict.items(), key=lambda item: item[0]) objectStr = "" for objectItem in objectDict: objectStr += objectItem[1].lemma relationString = relation_verb.lemma + "" + objectStr else: for eachWord in sentence.words: if eachWord.dependency == "POB" and eachWord.head_word.dependency == "CMP" and\ eachWord.head_word.head_word.ID == relation_verb.ID: relationString = relation_verb.lemma + "" + eachWord.head_word.lemma + "" + eachWord.lemma verb_dict[OBJ.ID] = relationString if SUB == None: #如果没找到主语,那么就找与该动词并列的verbId2的主语 for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verbId2: # if SUB == None or SUB.lemma != entity: SUB = item S_dict[SUB.ID] = SUB.lemma # print(verbId2) if OBJ == None: verb_coo = None for item in sentence.words: if item.dependency == "COO" and item.head_word.ID == verbId and item.ID > verbId: verb_coo = item break flag = True if verb_coo != None and self.get_entity_num_between( relation_verb, verb_coo, sentence) == 0: for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verb_coo.ID: flag = False if flag != False: for item in sentence.words: if (item.dependency == "VOB" and item.head_word.ID == verb_coo.ID)\ or (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP" and item.head_word.head_word.ID== verb_coo.ID) or (item.dependency == "POB" and item.head_word.postag == "p"\ and item.head_word.dependency == "ADV" and item.head_word.head_word.ID== verb_coo.ID): OBJ = item O_dict[OBJ.ID] = OBJ.lemma # print(S_dict) # print(verb_dict) # print(O_dict) SUB_COO = None OBJ_COO = None for item in sentence.words: if item.head_word != None: if SUB != None and item.dependency == "COO" and item.head_word.ID in S_dict: #获得主语的COO SUB_COO = item S_dict[SUB_COO.ID] = SUB_COO.lemma if item.head_word != None and OBJ != None: if item.dependency == "COO" and item.head_word.ID in O_dict: #获得宾语的COO OBJ_COO = item O_dict[OBJ_COO.ID] = OBJ_COO.lemma S_new = [] for sub in S_dict: S_new.append(S_dict[sub]) O_new = [] V_new = [] for obj in O_dict: if verb_dict != None: if obj in verb_dict: relationString2 = verb_dict[obj] else: relationString2 = relation_verb.lemma else: relationString2 = relation_verb.lemma V_new.append(relationString2) O_new.append(O_dict[obj]) for sub in S_new: for i in range(0, len(O_new)): obj = O_new[i] relationWord = V_new[i] if obj != "": 
all_result.append([sub, relationWord, obj]) raw_sentence.append(RawSentence) return all_result, raw_sentence def hasEntity(self, word, allEntity): for entity in allEntity: if entity in word: # print(entity) return True return False def PostProcessSPO(self, rawSentence, allTripes, allEntity): output_list = [] for i in range(0, len(allTripes)): tripe = allTripes[i] sub = tripe[0] obj = tripe[2] print(sub) print(obj) if self.hasEntity(sub, allEntity) and self.hasEntity( obj, allEntity): # print(sub) # print(obj) output_list.append(tripe) return output_list
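A possible driver for the DSFN class above, shown only as a sketch: it assumes the default model directory, the user lexicon, and the all_entity.json entity list all resolve on the local machine, and it simply chains the sentence splitting, SPO extraction, and entity-based post-filtering methods defined above.

# Hedged driver sketch, not part of the original project code.
if __name__ == '__main__':
    dsfn = DSFN()                                        # loads cws/pos/ner/parser models
    text = '习近平主席在北京大学发表演讲。李克强总理视察并访问上海。'
    all_triples, all_sentences = [], []
    for sent in dsfn.splitSentence(text):                # split on 。!?;
        for clause in dsfn.dsfnConstraints1(sent, 40):   # fall back to comma splits for long sentences
            triples, raws = dsfn.getSPO(clause, dsfn.segmentor)
            all_triples.extend(triples)
            all_sentences.extend(raws)
    # Keep only triples whose subject and object both contain a known entity.
    print(dsfn.PostProcessSPO(all_sentences, all_triples, DSFN.all_entity))
    dsfn.close()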
sentence = SentenceSplitter.split(paragraph)[0]

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
words = segmentor.segment(sentence)
print("\t".join(words))

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# list-of-string parameter is supported in 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print("\t".join(postags))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print("\t".join(netags))

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
roles = labeller.label(words, postags, netags, arcs)
for role in roles:
    # Print each predicate index and its labelled arguments as name:(start,end).
    print("%d: %s" % (role.index, " ".join(
        "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments)))
LTP_DATA_DIR = './ltp_data_v3.4.0'  # path to the ltp model directory
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parsing model, named `parser.model`
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')     # word segmentation model
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')     # POS tagging model, named `pos.model`
# srl_model_path = os.path.join(LTP_DATA_DIR, 'pisrl.model')  # semantic role labelling model directory

segmentor = Segmentor()          # create the instance
segmentor.load(cws_model_path)   # load the model
postagger = Postagger()          # create the instance
postagger.load(pos_model_path)   # load the model
parser = Parser()                # create the instance
parser.load(par_model_path)      # load the model
# labeller = SementicRoleLabeller()  # create the instance
# labeller.load(srl_model_path)      # load the model


def cht_to_chs(line):
    # Convert traditional Chinese characters to simplified Chinese.
    line = Converter('zh-hans').convert(line)
    line.encode('utf-8')  # note: the encoded bytes are discarded; the str itself is returned
    return line


def data_prepare(sentences, labA, labT, labD):
    # Collect the manual annotations.
    wordList = []
    labelList = []