def name_recognize_one():
    """Demo of the full LTP pipeline on one sentence.

    Splits a hard-coded paragraph, then segments, POS-tags, dependency-parses
    and NER-tags the second sentence, printing each stage's output.
    """
    import sys, os
    import pyltp
    from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

    # NOTE(review): sg_model_path / ps_model_path / pr_model_path / ner_model_path
    # are assumed to be module-level globals pointing at LTP model files — confirm.
    paragraph = '叙利亚东古塔地区。7日发生疑似化学武器袭击事件,导致70余人丧生。报道一出,叙利亚反对派、美国、英国、法国等纷纷指责叙政府军使用化学武器袭击无辜平民。但叙利亚坚决否认,并指责西方和叙反对派造谣,目的是保护被围困的恐怖分子。俄外交部则认为,该谣言旨在袒护恐怖分子,并为外部势力发动打击寻找借口。'
    sentence = SentenceSplitter.split(paragraph)[1]
    print('split {}'.format(sentence))  # sentence splitting

    segmentor = Segmentor()
    segmentor.load(sg_model_path)
    words = segmentor.segment(sentence)
    print('|'.join(words))

    postagger = Postagger()
    postagger.load(ps_model_path)
    postags = postagger.postag(words)
    for k, v in dict(zip(words, postags)).items():
        print(k, v)

    parser = Parser()
    parser.load(pr_model_path)
    arcs = parser.parse(words, postags)
    print(' '.join('%d:%s ' % (arc.head, arc.relation) for arc in arcs))
    print('#' * 8)

    recognizer = NamedEntityRecognizer()
    recognizer.load(ner_model_path)
    netag = recognizer.recognize(words, postags)
    for word, ntag in zip(words, netag):
        if ntag != 'O':
            # BUG FIX: the original printed `netag` (the whole tag sequence),
            # which raises TypeError when concatenated with a str; print the
            # single tag `ntag` instead.
            print(word + ' / ' + ntag)
    print(' / '.join(netag))  # named-entity recognition result

    # Second NER demo with a pre-segmented, pre-tagged sentence.
    word_list = ['欧几里得', '是', '西元前', '三', '世纪', '的', '希腊', '数学家', '。']
    postags_list = ['nh', 'v', 'nt', 'm', 'n', 'u', 'ns', 'n', 'wp']
    nertags = recognizer.recognize(word_list, postags_list)
    for word, ntag in zip(word_list, nertags):
        if ntag != 'O':
            print(word + '/' + ntag)
    print(' '.join(nertags))

    segmentor.release()
    postagger.release()
    parser.release()
    recognizer.release()
class pyltp_model():
    """Bundles pyltp segmentation, POS tagging and NER behind one object."""

    def __init__(self, LTP_DATA_DIR='/Users/didi/Desktop/ltp_data_v3.4.0'):
        # Resolve the three model files under the LTP data directory.
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
        self.segmentor = Segmentor()
        self.postagger = Postagger()
        self.recognizer = NamedEntityRecognizer()
        self.segmentor.load(cws_model_path)
        self.postagger.load(pos_model_path)
        self.recognizer.load(ner_model_path)

    def token(self, sentence):
        """Tokenize *sentence*; single-token named entities (S-Nh/S-Ni/S-Ns)
        are replaced by their NE tag instead of the surface word."""
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        netags = list(self.recognizer.recognize(words, postags))
        return [tag if tag in ('S-Nh', 'S-Ni', 'S-Ns') else word
                for word, tag in zip(words, netags)]

    def close(self):
        """Release all three loaded models."""
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
def ltp_ner_data():
    """Run LTP named-entity recognition over the POS-tagged train/test files
    and write `word/_netag`-formatted lines to the corresponding NER files."""
    # FIX: raw string — the original non-raw literal contained the invalid
    # escape sequences '\B' and '\l' (DeprecationWarning today, SyntaxError in
    # future Python); the raw form yields the identical path value.
    LTP_DATA_DIR = r'D:\BaiduNetdiskDownload\ltp_data_v3.4.0'
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # NER model file
    from pyltp import NamedEntityRecognizer
    recognizer = NamedEntityRecognizer()
    recognizer.load(ner_model_path)
    result = []
    # (input POS file, output NER file) pairs for train and test sets.
    file = [(const.qc_train_pos, const.qc_train_ner),
            (const.qc_test_pos, const.qc_test_ner)]
    for i in range(2):
        with open(file[i][0], 'r', encoding='utf-8') as f:
            for line in f.readlines():
                attr = line.strip().split('\t')
                # attr[1] holds space-separated "word/_postag" tokens.
                words_pos = attr[1].split(" ")
                words = [word.split('/_')[0] for word in words_pos]
                postags = [word.split('/_')[1] for word in words_pos]
                netags = recognizer.recognize(words, postags)
                res = ' '.join([
                    "{}/_{}".format(words[i], netags[i])
                    for i in range(len(words))
                ])
                result.append("{}\t{}\n".format(attr[0], res))
        with open(file[i][1], 'w', encoding='utf-8') as f:
            f.writelines(result)
        result.clear()
    recognizer.release()
def segmentsentence(sentence):
    """Segment *sentence*, print tokens and NE tags, and collect person-name
    tokens into the external ``entity_list``. Returns the token list."""
    segmentor = Segmentor()
    postagger = Postagger()
    parser = Parser()  # kept for parity with original; parser model load is disabled
    recognizer = NamedEntityRecognizer()
    segmentor.load("./ltpdata/ltp_data_v3.4.0/cws.model")
    postagger.load("./ltpdata/ltp_data_v3.4.0/pos.model")
    # parser.load("./ltpdata/ltp_data_v3.4.0/parser.model")
    recognizer.load("./ltpdata/ltp_data_v3.4.0/ner.model")
    word_list = segmentor.segment(sentence)
    postags_list = postagger.postag(word_list)
    nertags = recognizer.recognize(word_list, postags_list)
    for word, ntag in zip(word_list, nertags):
        # BUG FIX: LTP emits position-prefixed tags ('S-Nh', 'B-Nh', 'I-Nh',
        # 'E-Nh'); the original compared `ntag == 'Nh'`, which never matches,
        # so no entity was ever collected.
        if ntag.endswith('-Nh'):
            # NOTE(review): entity_list is assumed to be a module-level list — confirm.
            entity_list.append(word)
    print(" ".join(word_list))
    print(' '.join(nertags))
    segmentor.release()
    postagger.release()
    # parser.release()
    recognizer.release()
    return word_list
def ner_data():
    """Segment, POS-tag and NER-tag every title in ../data.csv, writing the
    combined result to ./data_processed_recognizer.csv."""
    # segmentation model
    segmentor = Segmentor()
    segmentor.load('cws.model')
    # POS-tagging model
    postagger = Postagger()
    postagger.load('pos.model')
    # NER model
    recognizer = NamedEntityRecognizer()
    # BUG FIX: the original called load() on the class
    # (`NamedEntityRecognizer.load('ner.model')`), which raises a TypeError;
    # it must be called on the instance.
    recognizer.load('ner.model')
    # Load the data to be processed.
    data_csv = pd.read_csv('../data.csv', encoding='utf-8-sig')
    datas = data_csv['title']
    util = Utils()
    # `with` guarantees the output file is closed even if a row fails.
    with open('./data_processed_recognizer.csv', 'w', encoding='utf-8') as data_processed:
        for data in datas:
            words = segmentor.segment(data)
            postags = postagger.postag(words)
            word_split = ' '.join(words).split(' ')
            netags = recognizer.recognize(words, postags)
            netag_split = ' '.join(netags).split(' ')
            concat_word = util.concat(word_split, netag_split, tag='netags')
            data_processed.write(concat_word + '\n')
class pyltp_impl(Seg):
    """pyltp-backed analyzer that loads only the models its mode requires."""

    def __init__(self, dictpath, mode='seg'):
        super().__init__(mode)
        from pyltp import Segmentor
        from pyltp import Postagger
        from pyltp import NamedEntityRecognizer
        self.ltp_seg = Segmentor()
        self.ltp_pos = Postagger()
        self.ltp_ner = NamedEntityRecognizer()
        # Segmentation is always needed; POS/NER models only for richer modes.
        self.ltp_seg.load(os.path.join(dictpath, 'cws.model'))
        if mode != 'seg':
            self.ltp_pos.load(os.path.join(dictpath, 'pos.model'))
        if mode == 'ner':
            self.ltp_ner.load(os.path.join(dictpath, 'ner.model'))

    def impl_func(self, sentence):
        """Return tokens ('seg'), (token, pos) pairs ('postag'),
        or (token, ne-tag) pairs (any other mode)."""
        tokens = self.ltp_seg.segment(sentence)
        if self.mode == 'seg':
            return tokens
        pos_tags = self.ltp_pos.postag(tokens)
        if self.mode == 'postag':
            return list(zip(tokens, pos_tags))
        ne_tags = self.ltp_ner.recognize(tokens, pos_tags)
        return list(zip(tokens, ne_tags))
class Ltp(NerModel):
    """NER model backed by LTP segmentation + POS tagging + entity recognition."""

    def __init__(self):
        super(Ltp, self).__init__()
        self._model_path = "./model/ltp/"
        self._seg = Segmentor()
        self._pos = Postagger()
        self._recognizer = NamedEntityRecognizer()
        self._load_model()
        self._object_str = "[INFO] This is ltp object!"
        print("[INFO] All model is load!")

    def __repr__(self):
        return self._object_str

    def _load_model(self):
        # Load all three models from the fixed model directory.
        for engine, filename in ((self._seg, "cws.model"),
                                 (self._pos, "pos.model"),
                                 (self._recognizer, "ner.model")):
            engine.load(self._model_path + filename)

    def get_entity(self, sentence):
        """Return the concatenation of every non-'O' token in *sentence*,
        or None when no entity token is found."""
        tokens = self._seg.segment(sentence)
        pos_tags = self._pos.postag(tokens)
        ne_tags = self._recognizer.recognize(tokens, pos_tags)
        entity = [tok for tok, ne in zip(tokens, ne_tags) if ne != 'O']
        if not entity:
            return None
        return "".join(entity) if len(entity) > 1 else entity[0]
def ner(words, postags):
    """Run LTP named-entity recognition.

    Returns:
        (tag list, word list) as plain Python lists.
    """
    engine = NamedEntityRecognizer()
    engine.load('D:\\ltp_data\\ner.model')  # load the NER model
    tag_seq = engine.recognize(words, postags)  # recognize entities
    engine.release()  # free the model
    return list(tag_seq), list(words)
def get_ner_list(words_list, postag_list):
    """Tag *words_list*/*postag_list* with LTP NER and return the tags as a list."""
    engine = NamedEntityRecognizer()
    engine.load(ner_model_path)  # module-level model path
    tags = list(engine.recognize(words_list, postag_list))
    engine.release()
    return tags
def name_recognition(words, postags):
    '''
    Named-entity recognition.
    :param words: segmentation result (token list)
    :param postags: POS-tagging result
    :return: the raw LTP NE tag sequence
    '''
    recognizer = NamedEntityRecognizer()  # init instance
    recognizer.load('E:\\NLP-homework\\ltp-data-v3.3.1\\ltp_data\\ner.model')  # load model
    netags = recognizer.recognize(words, postags)  # recognize entities
    result = ''
    # Collect triples (entity, connector, entity): an entity-tagged token,
    # followed by an 'O' token that is neither empty nor a fullwidth comma,
    # followed by another entity-tagged token.
    # NOTE(review): `'s' in netags[i]` matches any tag containing a lowercase
    # 's' (e.g. 'S-Ns', 'B-Ns') — presumably meant as a place-name test; confirm.
    for i in range(0, len(netags)):
        if i < len(words) - 2:  # guard the i+1 / i+2 lookahead
            if 's' in netags[i]:
                if 'O' in netags[
                        i + 1] and words[i + 1] != '' and words[i + 1] != ',':
                    if 's' in netags[i + 2]:
                        result += words[i] + words[i + 1] + words[i + 2] + ""
    print(result)
    recognizer.release()  # free the model
    return netags
class Parse_Util(object):
    """Holds the four LTP analyzers (segmenter, POS tagger, dependency parser,
    NER) loaded once and reused across sentences."""

    def __init__(self, lexicon_path='./data/lexicon'):
        # Segmentation (lexicon-augmented loading is currently disabled).
        self.segmentor = Segmentor()
        self.segmentor.load(cws_model_path)
        # POS tagging.
        self.postagger = Postagger()
        self.postagger.load(pos_model_path)
        # Dependency parsing.
        self.parser = Parser()
        self.parser.load(par_model_path)
        # Named-entity recognition.
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model_path)

    def __del__(self):
        # NOTE(review): releasing in __del__ depends on interpreter teardown
        # order; an explicit close() would be more deterministic.
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()

    def parse_sentence(self, sentence):
        """Analyze one sentence; returns (words, postags, netags, arcs)."""
        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        netags = self.recognizer.recognize(words, postags)
        arcs = self.parser.parse(words, postags)
        return words, postags, netags, arcs
class LTP_word():
    """Full LTP pipeline: segmentation (with custom lexicon), POS tagging,
    NER, dependency parsing and semantic role labelling.

    deal() returns (words, postags, arcs, roles, netags);
    release() frees all cached models.
    """

    def __init__(self, model_path):
        self.model_path = model_path
        # Segmentation with a user lexicon.
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(
            path.join(self.model_path, 'cws.model'),
            path.join(self.model_path, 'dictionary_kfc.txt'))
        # POS tagging.
        self.postagger = Postagger()
        self.postagger.load(path.join(self.model_path, 'pos.model'))
        # Named-entity recognition.
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(path.join(self.model_path, 'ner.model'))
        # Dependency parsing.
        self.parser = Parser()
        self.parser.load(path.join(self.model_path, 'parser.model'))
        # Semantic role labelling.
        self.labeller = SementicRoleLabeller()
        self.labeller.load(path.join(self.model_path, 'srl'))

    def deal(self, text):
        """Run every pipeline stage on *text*."""
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)
        netags = self.recognizer.recognize(words, postags)
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, netags, arcs)
        return words, postags, arcs, roles, netags

    def release(self):
        """Free every loaded model."""
        for component in (self.segmentor, self.postagger, self.recognizer,
                          self.parser, self.labeller):
            component.release()
def ltp_word(self):
    """Segment, POS-tag, dependency-parse and NER-tag self.content, then
    hand the five result sequences to list_conversion()."""
    # Segmentation.
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    words = segmentor.segment(self.content)

    # POS tagging.
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    postags = postagger.postag(words)

    # Dependency parsing; split arcs into parallel head / relation lists.
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    arcs = parser.parse(words, postags)
    arcs_head = [arc.head for arc in arcs]
    arcs_relation = [arc.relation for arc in arcs]

    # Named-entity recognition.
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    netags = recognizer.recognize(words, postags)

    segmentor.release()
    postagger.release()
    parser.release()
    recognizer.release()

    # list_conversion turns the raw LTP sequences into plain lists.
    return list_conversion(words, postags, netags, arcs_head, arcs_relation)
def get_all_name(r_filename, w_file):
    """Read *r_filename* line by line, run LTP segment/POS/NER on each line,
    extract entities via get_some_idea(), and write key<TAB>values rows to
    *w_file*."""
    LTP_DATA_DIR = r'ltp_data_v3.4.0'  # LTP model directory
    segmentor = Segmentor()
    segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'))
    postagger = Postagger()
    postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))

    f_r = open(r_filename, "r", encoding="utf-8")
    f_w = open(w_file, "w", encoding="utf-8")
    count = 0
    for line in f_r:
        count += 1
        # Strip the trailing newline and any literal "\n" markers in the text.
        lines = line.strip("\n").replace(r"\n", "")
        words = segmentor.segment(lines)
        postags = postagger.postag(words)
        netags = recognizer.recognize(words, postags)
        sen = get_some_idea(line, netags, words)
        print(sen)
        if sen:
            for key in sen:
                # Deduplicate the second element of each entry for this key.
                sens = "\t".join(list(set([data[1] for data in sen[key]])))
                f_w.write(key + "\t" + sens + "\n")
    f_r.close()
    f_w.close()
def locationNER(text):
    """Extract place (Ns) and organization (Ni) entities from *text*.

    Returns a list of entity strings: S-tagged tokens alone, and I-tagged
    tokens joined with their neighbours.
    """
    # BUG FIX: `results` was never initialized (NameError on first append).
    results = []
    # Segmentation — copy tokens out before releasing the model.
    segmentor = Segmentor()
    segmentor.load(cws_model_path)
    words = list(segmentor.segment(text))
    segmentor.release()
    # POS tagging.
    postagger = Postagger()
    postagger.load(pos_model_path)
    postags = postagger.postag(words)
    postagger.release()
    # Named-entity recognition.
    recognizer = NamedEntityRecognizer()
    recognizer.load(ner_model_path)
    netags = list(recognizer.recognize(words, postags))
    recognizer.release()  # FIX: the recognizer was never released (leak)

    for i in range(len(netags)):
        if 'I-Ns' in netags[i] or 'I-Ni' in netags[i]:
            # FIX: guard the i-1 / i+1 neighbours — an I- tag at either end of
            # the sentence raised IndexError (or silently wrapped to words[-1]).
            if 0 < i < len(words) - 1:
                results.append(words[i - 1] + words[i] + words[i + 1])
        if 'S-Ns' in netags[i] or 'S-Ni' in netags[i]:
            results.append(words[i])
    return results
def namedEntityRecognize(sentence):
    """Run LTP named-entity recognition over *sentence* using pyltp.

    Returns:
        1) list of (word, NE tag) tuples, 2) flat list of NE tags.
    """
    # Segmentation with a user dictionary.
    segmentor = Segmentor()
    segmentor.load_with_lexicon(inout.getLTPPath(index.CWS),
                                inout.getResourcePath('userDic.txt'))
    words = segmentor.segment(sentence)
    segmentor.release()

    # POS tagging.
    postagger = Postagger()
    postagger.load(inout.getLTPPath(index.POS))
    postags = postagger.postag(words)
    postagger.release()

    # Named-entity recognition.
    recognizer = NamedEntityRecognizer()
    recognizer.load(inout.getLTPPath(index.NER))
    netags = recognizer.recognize(words, postags)
    recognizer.release()

    # Package as tuples plus a plain tag list.
    namedEntityTagTupleList = [(word, netag) for word, netag in zip(words, netags)]
    neTagList = '\t'.join(netags).split('\t')
    return namedEntityTagTupleList, neTagList
def name_entity_recognize(req):
    """Django view: NER over POST['intext']; responds with CORS-enabled JSON
    of the form {"result": [{"tag": ..., "content": ...}, ...]}."""
    if req.method == 'POST':
        import json  # local import: only this view needs it
        intext = req.POST["intext"].encode('utf-8', 'ignore')
        words = segmentor(intext)
        tags = posttagger(words)
        recognizer = NamedEntityRecognizer()
        recognizer.load(ner_model_path)
        netags = recognizer.recognize(words, tags)  # named-entity recognition
        # FIX: serialize with json.dumps instead of hand-concatenated strings —
        # the original produced invalid JSON whenever a token contained a
        # double quote or backslash (no escaping was applied).
        payload = {"result": [{"tag": "%s" % tag, "content": word}
                              for word, tag in zip(words, netags)]}
        outtext = json.dumps(payload, ensure_ascii=False)
        response = HttpResponse(outtext)
        # Permissive CORS headers for the front-end caller.
        response["Access-Control-Allow-Origin"] = "*"
        response["Access-Control-Allow-Methods"] = "POST"
        response["Access-Control-Max-Age"] = "1000"
        response["Access-Control-Allow-Headers"] = "*"
        return response
def name_recognition(words, postags):
    """
    Named-entity recognition (Python 2: note the bare `print` statement below).
    :param words: segmented tokens
    :param postags: POS tags
    :return: the raw LTP NE tag sequence
    """
    recognizer = NamedEntityRecognizer()  # init instance
    recognizer.load(
        'D:/Program Files/ltp-models/3.3.1/ltp-data-v3.3.1/ltp_data/ner.model'
    )  # load model
    netags = recognizer.recognize(words, postags)  # named-entity recognition
    # Place tag is ns.
    result = ''
    # Collect triples: an 's'-containing tag, then an 'O' token that is not an
    # ASCII or fullwidth comma, then another 's'-containing tag.
    for i in range(0, len(netags)):
        if i < len(words) - 2:  # guard the i+1 / i+2 lookahead
            if 's' in netags[i]:
                if 'O' in netags[
                        i + 1] and words[i + 1] != ',' and words[i + 1] != ',':
                    if 's' in netags[i + 2]:
                        result += words[i] + words[i + 1] + words[i + 2] + " "
    print result
    recognizer.release()  # release model
    return netags
def e_recognize(words, postags):
    """Run LTP named-entity recognition and return the raw tag sequence."""
    ner_engine = NamedEntityRecognizer()  # init instance
    ner_engine.load(ner_model_path)       # load model
    tag_seq = ner_engine.recognize(words, postags)
    ner_engine.release()                  # release model
    return tag_seq
def ner(words, postags):
    """NER over (words, postags); prints each word/tag pair and returns the tags."""
    engine = NamedEntityRecognizer()
    engine.load(os.path.join(LTP_DATA_DIR, 'ner.model'))
    tag_seq = engine.recognize(words, postags)
    for token, tag in zip(words, tag_seq):
        print(token + '/' + tag)
    engine.release()
    return tag_seq
def get_ner(words, postags):
    """LTP named-entity recognition; returns the tags as a plain list."""
    model_file = os.path.join(LTP_TOP_DIR, 'ner.model')
    engine = NamedEntityRecognizer()
    engine.load(model_file)
    tag_seq = engine.recognize(words, postags)
    engine.release()
    return list(tag_seq)
def ltp_name_entity_recognizer(LTP_DATA_DIR, words, postags):
    """Named-entity recognition with the `ner.model` under *LTP_DATA_DIR*.

    Returns the NE tags as a plain Python list.
    """
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
    recognizer = NamedEntityRecognizer()  # init instance
    recognizer.load(ner_model_path)       # load model
    # FIX: copy the tags into a Python list BEFORE releasing the model — the
    # object recognize() returns is backed by the recognizer's native memory,
    # so returning it after release() risks dangling data (the other helpers
    # in this file convert to list for the same reason).
    netags = list(recognizer.recognize(words, postags))
    recognizer.release()                  # release model
    return netags
def ner(words, postags):
    """NER with a hard-coded model path; prints each word/tag pair and
    returns the tag sequence."""
    engine = NamedEntityRecognizer()
    engine.load('/Users/chenming/Spyder/3.3.1/ltp_data/ner.model')
    tag_seq = engine.recognize(words, postags)
    for token, tag in zip(words, tag_seq):
        print(token + '/' + tag)
    engine.release()
    return tag_seq
def entity_recognize(cutting_list, tagging_list):
    """Run LTP NER over segmented tokens and their POS tags.

    Returns the NE tags as a plain Python list.
    """
    ner_model_path = os.path.join(LtpParser.ltp_path, 'ner.model')
    from pyltp import NamedEntityRecognizer
    recognizer = NamedEntityRecognizer()
    recognizer.load(ner_model_path)
    # FIX: copy the tags into a list BEFORE releasing the model — the returned
    # sequence is backed by the recognizer's native memory, so returning it
    # after release() risks dangling data.
    ne_tags = list(recognizer.recognize(cutting_list, tagging_list))
    recognizer.release()
    return ne_tags
def ner(words, postags):
    """Named-entity recognition; returns the raw LTP tag sequence."""
    engine = NamedEntityRecognizer()   # init instance
    engine.load('../ltp_data/ner.model')  # load model
    tag_seq = engine.recognize(words, postags)
    engine.release()                   # release model
    return tag_seq
def get_ner(self, word_list, postag_list, model):
    """Load the NER model at *model*, tag the tokens, release, and return
    the tags as a plain list."""
    engine = NamedEntityRecognizer()
    engine.load(model)
    tag_seq = engine.recognize(word_list, postag_list)
    engine.release()
    return list(tag_seq)
def get_length(self, filename):
    # Parse a file of JSON-like document lines, count tokens per document,
    # collect entity strings, and dump the deduplicated entities to ner_word.txt.
    # NOTE(review): assumes each line looks like {"pid": N, "document": ["s1", "s2", ...]}
    # and is parsed by string stripping rather than a JSON parser — confirm format.
    segmentor = Segmentor()
    segmentor.load('cws.model')
    postagger = Postagger()  # init instance
    postagger.load('pos.model')  # load model
    recognizer = NamedEntityRecognizer()  # init instance
    recognizer.load('ner.model')  # load model
    f = open(filename, 'r')  # NOTE(review): never closed — presumably relies on GC
    l_dict = {}   # doc id -> token count
    doc = []      # raw input lines
    q_ner = []    # collected entity strings
    all_sum = 0   # total token count across all documents
    for i in f:
        doc.append(i)
    all_doc_n = len(doc)
    for k in range(all_doc_n):
        sum = 0  # token count for this document (shadows builtin sum)
        # Strip the JSON scaffolding by literal replacement, then split on commas.
        doc_list = doc[k].replace('{"pid": ', '').replace('"document": [',
                                                          '').replace(']}', '').split(',')
        doc_n = int(doc_list[0])       # document id (first field after stripping)
        sentense_num = len(doc_list)
        for i in range(1, sentense_num):
            d = doc_list[i][2:-1]      # drop the surrounding ` "` and `"` quote residue
            words = []
            cut_words = '\t'.join(segmentor.segment(d))
            words_list = cut_words.split('\t')
            if words_list == ['']:     # skip empty sentences
                continue
            postags = postagger.postag(words_list)  # POS tagging
            pos_line = '\t'.join(postags)
            q_pos_list = pos_line.split('\t')
            netags = recognizer.recognize(words_list, postags)  # NER
            ner_line = '\t'.join(netags)
            ner_list = ner_line.split('\t')
            sum += len(words_list)
            ner_str = ''
            # Assemble multi-token entities: accumulate until an S- or E- tag
            # closes the entity, then store it.
            # NOTE(review): the accumulator is not reset when an 'O' tag
            # interrupts a partial entity — confirm this is intended.
            for nr in range(len(ner_list)):
                if ner_list[nr][0] != 'O':
                    if ner_list[nr][0] == 'S' or ner_list[nr][0] == 'E':
                        ner_str += words_list[nr]
                        q_ner.append(ner_str)
                        ner_str = ''
                    else:
                        ner_str += words_list[nr]
        all_sum += sum
        l_dict[doc_n] = sum
    q_ner = list(set(q_ner))  # dedupe entity strings
    with open('ner_word.txt', 'w') as f:
        for qn in q_ner:
            f.write(qn)
            f.write('\n')
def segment(self, texts, use_tag_filter=True):
    # Analyze a batch of texts with LTP: segmentation (custom lexicon),
    # POS tagging, NER, and dependency parsing.
    # Returns five parallel lists (one entry per input text):
    # words, POS tags, NE tags, dependency relations, dependency head words.
    words = []
    pos = []
    ner = []
    rel = []
    hea = []
    segmentor = Segmentor()
    # Load with a user dictionary for domain-specific tokens.
    segmentor.load_with_lexicon(self.cws_model_path, './dict/user_recg.dic')
    postagger = Postagger()
    postagger.load(self.pos_model_path)
    recognizer = NamedEntityRecognizer()
    recognizer.load(self.ner_model_path)
    parser = Parser()
    parser.load(self.pas_model_path)
    for text in texts:
        text = text.lower()
        word_list = segmentor.segment(text)
        # Drop single-character tokens.
        word_list = [word for word in word_list if len(word) > 1]
        # Drop stop words.
        word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]
        # POS tagging.
        posttags = postagger.postag(word_list)
        postags = list(posttags)
        # Named-entity recognition.
        netags = recognizer.recognize(word_list, postags)
        # Dependency parsing.
        arcs = parser.parse(word_list, postags)
        rely_id = [arc.head for arc in arcs]          # dependency parent ids
        relation = [arc.relation for arc in arcs]     # dependency relations
        # Map each parent id to its word ('Root' for id 0).
        heads = ['Root' if id == 0 else word_list[id - 1] for id in rely_id]
        if use_tag_filter:
            # Keep only words whose POS tag is in self.tags_filter.
            # NOTE(review): dict(zip(...)) collapses duplicate words, so
            # repeated tokens survive at most once here — confirm intended.
            dic = dict(zip(word_list, postags))
            word_list = [x for x in dic.keys() if dic[x] in self.tags_filter]
        words.append(word_list)
        pos.append(postags)
        ner.append(netags)
        rel.append(relation)
        hea.append(heads)
    segmentor.release()
    postagger.release()
    recognizer.release()
    parser.release()
    return words, pos, ner, rel, hea
def pyltp_ner(text):
    # Extract organization names (Ni entities) from *text* with pyltp.
    LTP_DATA_DIR = Path.cwd().parent / 'ltp_model'  # LTP model directory
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    # Segmentation.
    segmentor = Segmentor()  # init instance
    segmentor.load(cws_model_path)  # load model
    words = segmentor.segment(text)  # segment
    words_list = list(words)  # words_list holds the segmentation result
    segmentor.release()  # release model
    # POS tagging.
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
    postagger = Postagger()  # init instance
    postagger.load(pos_model_path)  # load model
    postags = postagger.postag(words)  # POS tagging
    postags_list = list(postags)  # postags_list holds the POS result
    postagger.release()  # release model
    # Named-entity recognition.
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
    recognizer = NamedEntityRecognizer()  # init instance
    recognizer.load(ner_model_path)  # load model
    netags = recognizer.recognize(words, postags)  # NER
    netags_list = list(netags)  # netags_list holds the NER result
    data = {"reg": netags, "words": words, "tags": postags}  # NOTE(review): unused
    recognizer.release()  # release model
    # Drop tokens that are not named entities.
    a = len(words_list)
    words_list_1 = []
    postags_list_1 = []
    netags_list_1 = []
    for i in range(a):
        if netags_list[i] != 'O':
            words_list_1.append(words_list[i])
            postags_list_1.append(postags_list[i])
            netags_list_1.append(netags_list[i])
    # Extract organization names: S-Ni is a complete entity; B-Ni starts a
    # multi-token entity continued by I-Ni and closed by E-Ni.
    a1 = len(words_list_1)
    organizations = []
    for i in range(a1):
        if netags_list_1[i] == 'S-Ni':
            organizations.append(words_list_1[i])
        elif netags_list_1[i] == 'B-Ni':
            temp_s = ""
            temp_s += words_list_1[i]
            j = i + 1
            while j < a1 and (netags_list_1[j] == 'I-Ni' or netags_list_1[j] == 'E-Ni'):
                temp_s += words_list_1[j]
                j = j + 1
            organizations.append(temp_s)
    orignizations = list(set(organizations))  # dedupe company names (typo in name kept)
    return orignizations
def ner(words, postags):
    """Named-entity recognition with a lazily-loaded, module-cached recognizer.

    The model is loaded on the first call and reused afterwards; returns the
    NE tags as a plain list.
    """
    global recognizer
    if recognizer is None:
        model_file = os.path.join(LTP_DATA_DIR, 'ner.model')
        recognizer = NamedEntityRecognizer()
        recognizer.load(model_file)
    return list(recognizer.recognize(words, postags))
def __init__(self):
    """Resolve LTP model paths, then segment / POS-tag / NER-tag the input,
    storing results on self.words, self.postags and self.netags."""
    # Model file paths under self.LTP_DATA_DIR.
    self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')
    self.pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')
    self.ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')

    # Segmentation.
    seg = Segmentor()
    seg.load(self.cws_model_path)
    # NOTE(review): `data` looks like a module-level global — confirm.
    self.words = seg.segment(data)
    seg.release()

    # POS tagging.
    tagger = Postagger()
    tagger.load(self.pos_model_path)
    self.postags = tagger.postag(self.words)
    tagger.release()

    # Named-entity recognition.
    ner_engine = NamedEntityRecognizer()
    ner_engine.load(self.ner_model_path)
    self.netags = ner_engine.recognize(self.words, self.postags)
    ner_engine.release()
def mingming_shiti(words, postags):
    """Named-entity recognition: organizations (Ni), persons (Nh), places (Ns).

    Prints the tab-joined tag sequence; returns the tags as a list.
    """
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    netags = list(recognizer.recognize(words, postags))
    print("\t".join(netags))
    # FIX: the model was never released (resource leak) — every other NER
    # helper in this file releases after use.
    recognizer.release()
    # Returning the tags is backward-compatible (the original returned None,
    # which no caller could have used).
    return netags
# --- LTP demo pipeline, script-level (Python 2: bare `print` statements). ---
# NOTE(review): `segmentor` and `words` are defined earlier in the script,
# outside this chunk — confirm before editing.

# POS tagging.
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# list-of-string parameter is supported in 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print "\t".join(postags)

# Dependency parsing.
parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)
print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

# Named-entity recognition.
recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print "\t".join(netags)

# Semantic role labelling (needs words, POS tags, NE tags and arcs).
labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
roles = labeller.label(words, postags, netags, arcs)
for role in roles:
    print role.index, "".join(
        ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])

# Release every model.
segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()