def load_all_model():
    """Load every model used by the pipeline and return the instances.

    Returns:
        list: ``[segmentor, postagger, recognizer, parser, model_wv]`` -- the
        LTP word segmenter, POS tagger, named-entity recognizer and
        dependency parser, followed by the word2vec ``KeyedVectors``.
    """
    ltp_root = 'E:/MYGIT/Project/ltp_data'  # directory holding the LTP models

    # Word segmentation (`cws.model`) plus an external segmentation lexicon.
    segmentor = Segmentor()
    segmentor.load_with_lexicon(
        os.path.join(ltp_root, 'cws.model'),
        './temp_file/cut_external_dict/cut_external_dict')

    # POS tagging (`pos.model`) plus an external POS lexicon.
    postagger = Postagger()
    postagger.load_with_lexicon(
        os.path.join(ltp_root, 'pos.model'),
        './temp_file/pos_external_dict/pos_external_dict')

    # Named-entity recognition (`ner.model`).
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(ltp_root, 'ner.model'))

    # Dependency parsing (`parser.model`).
    parser = Parser()
    parser.load(os.path.join(ltp_root, 'parser.model'))

    # Word vectors are memory-mapped read-only instead of being fully loaded.
    word_vectors = KeyedVectors.load(
        r"E:/MYGIT/model/wiki_stopwords/wiki_word2vec.kv", mmap='r')

    return [segmentor, postagger, recognizer, parser, word_vectors]
class Opinion(object):
    """Collect candidate words from sentences based on their POS tags.

    Python 2 code: words are handled as UTF-8 byte strings and decoded only
    to measure their length in characters.
    """

    def __init__(self, Dsent, industry_id):
        """Store the inputs, load the POS tagger and read the opinion words.

        :param Dsent: mapping whose values are UTF-8 encoded sentence strings
            (it is iterated with ``.items()`` and decoded in ``prepare``)
        :param industry_id: passed to ``read_opinion`` to fetch known words
        """
        self.industry_id = industry_id
        self.Dsent = Dsent
        self.postagger = Postagger()  # create the tagger instance
        self.postagger.load_with_lexicon(pos_model_path,
                                         '%s/conf/posttags.txt' % dir_path)
        self.sql = mysqls()
        self.opinionword = read_opinion(self.industry_id)
        # Accumulates every candidate word found by word_sex().
        self.n_v = []

    def cut_word(self, sents):  # word segmentation
        """Segment ``sents`` with ``norm_cut`` and return UTF-8 encoded words."""
        words = [i.encode('utf-8', 'ignore') for i in norm_cut(sents)]  # HMM=False
        return words

    def word_sex(self, ):  # get the POS tags
        """Tag ``self.words`` and append candidate words to ``self.n_v``.

        :return: the list of POS tags for ``self.words``
        """
        postags = list(self.postagger.postag(self.words))  # POS tagging
        num = 0  # index of the current tag
        # The word following an adverb ('d'), if it is a noun or a verb.
        for tag in postags:
            if tag in ['d']:
                if num + 1 < len(postags):
                    if num != 0 and postags[num + 1] in ['n', 'v']:
                        # Keep only unknown words longer than one character.
                        if self.words[num+1] not in self.opinionword \
                                and len(self.words[num + 1].decode('utf-8','ignore')) > 1:
                            self.n_v.append(self.words[num + 1])
            # The current word itself when tagged 'a', 'i' or 'b'.
            if tag in ['a', 'i', 'b']:
                if self.words[num] not in self.opinionword\
                        and len(self.words[num].decode('utf-8','ignore')) > 1:
                    self.n_v.append(self.words[num])
            num += 1
        return postags

    def prepare(self, ):
        """Process every sentence and return the most frequent candidates.

        The tagger is released at the end, so the instance cannot tag again
        afterwards.

        :return: up to 500 ``(word, count)`` pairs, highest count first
        """
        for id, sentences in self.Dsent.items():
            # Split each text into clauses on punctuation / whitespace.
            split_sentence = re.split(
                ur'[,,()()、: …~?。!. !?]?',
                sentences.decode('utf-8', 'ignore').strip())
            for sent in split_sentence:
                self.words = self.cut_word(sent.encode('utf-8', 'ignore'))
                self.postags = self.word_sex()
        cword = Counter(self.n_v)
        lresult = heapq.nlargest(500, cword.items(), key=lambda x: x[1])
        # lword = []
        # for rg in lresult:
        #     w, n = rg
        #     lword.append(w)
        # self.sql.insert(self.industry_id, lword)
        self.postagger.release()  # release the model
        # self.parser.release()  # release the model
        # outfile.close()
        return lresult
def new_relation_find(words, sentence):
    """Discover a new relation between two known entities in a sentence.

    The first two non-overlapping entries of ``entity_words`` that occur in
    ``sentence`` are taken as the entity pair.  The words are then re-merged
    around those entities, POS-tagged, and the first verb located within two
    characters of the entity span is returned as the relation word.

    :param words: segmented words of ``sentence``
    :param sentence: the raw sentence text
    :return: ``(entity_a, entity_b, relation_word)`` with the entities ordered
        by their position in ``sentence``, or ``("", "", "")`` when no entity
        pair or no relation word is found
    """
    # Maps the start offset of each matched entity to the entity text.
    tuple_dict = dict()
    index0 = -1
    index1 = -1
    found_pair = False
    for entity_word in entity_words:
        position = sentence.find(entity_word)
        if position == -1:
            continue
        if tuple_dict:
            # A second entity must not overlap the first one.
            if has_same(tuple_dict[index0], entity_word):
                continue
            index1 = position
            tuple_dict[index1] = entity_word
            found_pair = True
            break
        index0 = position
        tuple_dict[index0] = entity_word
    if not found_pair:
        return "", "", ""

    # Re-split on "/" so that every element is a single token again.
    words = "/".join(words).split("/")
    for tuple_word in tuple_dict.values():
        words = init_words(tuple_word, words)

    # POS-tag the rebuilt words; release the model even if tagging fails.
    postagger = Postagger()
    try:
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        postagger.load_with_lexicon(pos_model_path, 'data/postagger.txt')
        postags = list(postagger.postag(words))
        print('\t'.join(postags))
    finally:
        postagger.release()

    first_index = min(index0, index1)
    last_index = max(index0, index1)

    # Look for the relation word.
    relation_word = ""
    consumed = 0  # number of characters covered by the words seen so far
    for index, postag in enumerate(postags):
        consumed += len(words[index])
        if consumed >= len(sentence):
            break
        if (postag == 'v'
                and consumed - first_index <= 2
                and last_index - consumed <= 2
                and not has_same(tuple_dict[index0], words[index])
                and not has_same(tuple_dict[index1], words[index])
                and words[index] not in wrong_relation):
            relation_word = words[index]
            break
    if relation_word == "":
        return "", "", ""
    return tuple_dict[first_index], tuple_dict[last_index], relation_word
def postaggers(words):
    """POS-tag ``words`` with a freshly loaded LTP tagger.

    The model (with the ``data/postagger.txt`` lexicon) is loaded on every
    call and always released, even when loading or tagging raises.

    :param words: sequence of segmented words
    :return: the tags produced by ``Postagger.postag`` for ``words``
    """
    postagger = Postagger()
    try:
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        postagger.load_with_lexicon(pos_model_path, 'data/postagger.txt')
        return postagger.postag(words)
    finally:
        postagger.release()
class Model:
    """Holder for the LTP models and the word-vector model."""

    def __init__(self):
        # Every handle starts empty; load_model() fills them in.
        self.segmentor = self.postagger = self.recognizer = None
        self.parser = self.model_wv = None

    def load_model(self):
        """Load the segmenter, POS tagger, NER, dependency parser and vectors.

        All locations come from ``Myconfig.get_path``; an ``AssertionError``
        is raised when a required path is empty.
        """
        ltp_dir = Myconfig.get_path('ltp_data')
        temp_dir = Myconfig.get_path('temp_file')
        assert ltp_dir
        assert temp_dir

        def model_file(name):
            return os.path.join(ltp_dir, name)

        # Word segmentation (`cws.model`) with its external lexicon.
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(
            model_file('cws.model'),
            os.path.join(temp_dir, 'cut_external_dict/cut_external_dict'))

        # POS tagging (`pos.model`) with its external lexicon.
        self.postagger = Postagger()
        self.postagger.load_with_lexicon(
            model_file('pos.model'),
            os.path.join(temp_dir, 'pos_external_dict/pos_external_dict'))

        # Named-entity recognition (`ner.model`).
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(model_file('ner.model'))

        # Dependency parsing (`parser.model`).
        self.parser = Parser()
        self.parser.load(model_file('parser.model'))

        # Word vectors, memory-mapped read-only.
        vector_path = Myconfig.get_path('vec.kv')
        assert vector_path
        self.model_wv = KeyedVectors.load(vector_path, mmap='r')

    def release_all_model(self):
        """Release the four LTP models and drop the word vectors."""
        for handle in (self.segmentor, self.postagger,
                       self.recognizer, self.parser):
            handle.release()
        del self.model_wv
        # Two collection passes, as in the original implementation.
        gc.collect()
        gc.collect()
def __init__(self, lexicon_path='./data/lexicon'):
    """Load the LTP POS tagger, parser, segmenter and entity recognizer.

    ``lexicon_path`` is used as the external lexicon of both the POS tagger
    and the segmenter.  The instance attributes are only set once every model
    has been loaded.
    """
    tagger = Postagger()
    tagger.load_with_lexicon(pos_model_path, lexicon_path)

    dep_parser = Parser()
    dep_parser.load(par_model_path)

    word_segmentor = Segmentor()
    word_segmentor.load_with_lexicon(cws_model_path, lexicon_path)

    entity_recognizer = NamedEntityRecognizer()
    entity_recognizer.load(ner_model_path)

    (self.postagger, self.parser,
     self.segmentor, self.recognizer) = (tagger, dep_parser,
                                         word_segmentor, entity_recognizer)
def SrlFunction(contents):
    """Run the LTP pipeline on ``contents`` and print the result of each stage.

    Stages, in order: word segmentation, POS tagging, dependency parsing and
    semantic role labelling.  Each model is loaded, used once and released.
    Nothing is returned; all results go to standard output.
    """
    # --- word segmentation, with a personal segmentation lexicon ---
    from pyltp import Segmentor
    seg = Segmentor()
    seg.load_with_lexicon(cws_model_path,
                          'E:\\ltp_data_v3.4.0\\personal_seg.txt')
    words = seg.segment(contents)
    # Print every token followed by its 1-based position.
    for position, token in enumerate(words, 1):
        print(token + str(position) + ' ', end='')
    print('\n')
    seg.release()
    wordslist = list(words)

    # --- POS tagging, with a personal POS lexicon ---
    from pyltp import Postagger
    tagger = Postagger()
    tagger.load_with_lexicon(pos_model_path,
                             'D:\\ltp_data_v3.4.0\\personal_pos.txt')
    postags = tagger.postag(wordslist)
    print('\t'.join(postags))
    tagger.release()

    # --- dependency parsing ---
    from pyltp import Parser
    dep_parser = Parser()
    dep_parser.load(par_model_path)
    arcs = dep_parser.parse(wordslist, postags)
    arc_texts = ["%d:%s" % (arc.head, arc.relation) for arc in arcs]
    print("\t".join(arc_texts))
    dep_parser.release()

    # --- semantic role labelling, fed with the dependency arcs ---
    from pyltp import SementicRoleLabeller
    labeller = SementicRoleLabeller()
    labeller.load(srl_model_path)
    roles = labeller.label(wordslist, postags, arcs)
    for role in roles:
        argument_text = "".join(
            "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
            for arg in role.arguments)
        print(role.index, argument_text)
    labeller.release()
def __init__(self, lexicon_path='./data/lexicon'):
    """Load the LTP tagger, parser and entity recognizer and configure jieba.

    Segmentation is delegated to jieba here (no LTP segmenter is created):
    ``lexicon_path`` is loaded both as the POS tagger's external lexicon and
    as jieba's user dictionary, and jieba is switched to parallel mode with
    12 worker processes.
    """
    tagger = Postagger()
    tagger.load_with_lexicon(pos_model_path, lexicon_path)

    dep_parser = Parser()
    dep_parser.load(par_model_path)

    entity_recognizer = NamedEntityRecognizer()
    entity_recognizer.load(ner_model_path)

    self.postagger, self.parser, self.recognizer = (
        tagger, dep_parser, entity_recognizer)

    jieba.load_userdict(lexicon_path)
    jieba.enable_parallel(12)
class Ltp:
    """Optional bundle of pyltp models.

    Reference: https://pyltp.readthedocs.io/zh_CN/latest/

    Only the requested models are created; an attribute (``segmentor``,
    ``postagger``, ``ner``, ``parser``) exists only when its model was
    requested.
    """

    def __init__(self,
                 seg=True,
                 pos=False,
                 ner=False,
                 parse=False,
                 seg_lexicon_path=None,
                 pos_lexicon_path=None):
        """Load the selected models from ``LTP_DATA_DIR``.

        :param seg: load the word segmenter
        :param pos: load the POS tagger (consumes segmentation output)
        :param ner: load the named-entity recognizer (consumes words + tags)
        :param parse: load the dependency parser (consumes words + tags)
        :param seg_lexicon_path: optional external lexicon for the segmenter
        :param pos_lexicon_path: optional external lexicon for the POS tagger
        """
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
        par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')

        if seg:
            self.segmentor = Segmentor()
            if seg_lexicon_path:
                self.segmentor.load_with_lexicon(cws_model_path,
                                                 seg_lexicon_path)
            else:
                self.segmentor.load(cws_model_path)
        if pos:
            # Load exactly once: either with or without the lexicon.
            self.postagger = Postagger()
            if pos_lexicon_path:
                self.postagger.load_with_lexicon(pos_model_path,
                                                 pos_lexicon_path)
            else:
                self.postagger.load(pos_model_path)
        if ner:
            self.ner = NamedEntityRecognizer()
            self.ner.load(ner_model_path)
        if parse:
            self.parser = Parser()
            self.parser.load(par_model_path)

    def release(self):
        """Release every model that was actually loaded.

        Each attribute is checked on its own so that a model which was never
        requested does not prevent the remaining ones from being released.
        """
        for attr in ('segmentor', 'postagger', 'ner', 'parser'):
            model = getattr(self, attr, None)
            if model is not None:
                model.release()

    def __del__(self):
        self.release()
def posttagger(words):
    """POS-tag ``words``, print every ``word/tag`` pair and return the tags.

    The model is loaded on every call and always released, even when loading
    or tagging raises.  ``print`` is called with a single parenthesised
    argument so the output is the same under Python 2 and Python 3.

    :param words: sequence of segmented words
    :return: the tags produced by ``Postagger.postag`` for ``words``
    """
    postagger = Postagger()
    try:
        postagger.load_with_lexicon(pos_model_path,
                                    'D:\\LTP\\ltp_data\\pos.txt')
        postags = postagger.postag(words)
        print("词性标注:\n")
        for word, tag in zip(words, postags):
            print(word + '/' + tag)
    finally:
        postagger.release()
    return postags
def work():
    """Segment, POS-tag and NER-tag every line of ``Input`` and print results.

    For each input line the function prints the words that are not stop
    words, then each ``word /tag`` pair, then every ``word/tag`` pair whose
    entity tag is not ``'O'``.  The three models are released when the
    function finishes, including when it fails part-way.
    """
    segmentor = Segmentor()
    postagger = Postagger()
    recognizer = NamedEntityRecognizer()
    # Every backslash is escaped explicitly; the path values are unchanged.
    cws_model_path = "D:\\Academic\\LTP\\3.4.0\\ltp_data_v3.4.0\\cws.model"
    pos_model_path = "D:\\Academic\\LTP\\3.4.0\\ltp_data_v3.4.0\\pos.model"
    try:
        # Load the models together with the shared dictionary.
        segmentor.load_with_lexicon(cws_model_path, "\\dictionary.txt")
        postagger.load_with_lexicon(pos_model_path, "\\dictionary.txt")
        recognizer.load('D:\\Academic\\LTP\\3.4.0\\ltp_data_v3.4.0\\ner.model')
        stopwords = stopwordslist('stoplist.txt')
        for line in Input.readlines():
            # Segmentation, dropping stop words.
            word_list = []
            for word in list(segmentor.segment(line)):
                if word not in stopwords:
                    print(''.join(word) + ' ', end='')
                    word_list.append(word)
            print()
            # POS tagging.
            postags = postagger.postag(word_list)
            for word, tag in zip(word_list, list(postags)):
                print(word + ' /' + tag)
            print()
            # Named-entity recognition; 'O' means "not part of an entity".
            netags = recognizer.recognize(word_list, postags)
            for word, tag in zip(word_list, netags):
                if tag != 'O':
                    print(word + '/' + tag)
            print()
    finally:
        postagger.release()
        segmentor.release()
        recognizer.release()
sents = SentenceSplitter.split(text)  # split the text into sentences

# Two segmenters share the same model but use different external lexicons.
# Windows paths are raw strings so that no backslash is read as an escape.
segmentor = Segmentor()
segmentor.load_with_lexicon(cws_model_path,
                            r'D:\python\ltp_data_v3.4.0\lexicon')
segmentor_2 = Segmentor()
segmentor_2.load_with_lexicon(cws_model_path,
                              r'D:\python\ltp_data_v3.4.0\lexicon_label')

# Two POS taggers, likewise with different external lexicons
# (the second argument is the path of the external lexicon file).
postagger = Postagger()
postagger_2 = Postagger()
postagger.load_with_lexicon(pos_model_path,
                            r'D:\python\ltp_data_v3.4.0\lexicon_1')
postagger_2.load_with_lexicon(pos_model_path,
                              r'D:\python\ltp_data_v3.4.0\lexicon_label_1')

# Two named-entity recognizers loaded from the same model.
recognizer = NamedEntityRecognizer()
recognizer_2 = NamedEntityRecognizer()
recognizer.load(ner_model_path)
recognizer_2.load(ner_model_path)

# Dependency parser.
parser = Parser()
parser.load(par_model_path)

# Semantic role labeller.
labeller = SementicRoleLabeller()
labeller.load(srl_model_path)


def is_name_entity(entity):
    """Return True when ``entity`` is any NER tag other than ``'O'``."""
    return entity != 'O'
class NlpLtp():
    """Wrapper around the pyltp segmenter, POS tagger and entity recognizer.

    Entities found by ``ner`` are accumulated in ``self.nerdict`` as
    ``{word: [tag, count]}`` until ``clean_ner`` is called.  Dependency
    parsing and semantic role labelling are not loaded by this class.
    """

    def __init__(self):
        print('Load pyplt models...')
        start = time.time()
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(cws_model_path, user_dict_seg)
        self.postagger = Postagger()
        self.postagger.load_with_lexicon(pos_model_path, user_dict_pos)
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model_path)
        self.nerdict = dict()
        elapsed = time.time() - start
        print('Load pyplt models finished in ', elapsed)

    def __del__(self):
        """Release the models that were actually created.

        Attributes are looked up defensively because ``__del__`` also runs
        for instances whose ``__init__`` failed part-way.
        """
        print('Release pyplt models...')
        for name in ('segmentor', 'postagger', 'recognizer'):
            model = getattr(self, name, None)
            if model is not None:
                model.release()
        print('Release pyplt models finished.')

    def sentence(self, content):
        """Split ``content`` into sentences."""
        return SentenceSplitter.split(content)

    def segment(self, text):
        """Segment ``text`` into words."""
        return self.segmentor.segment(text)

    def postag(self, wordlist):
        """Return the POS tags for ``wordlist``."""
        return self.postagger.postag(wordlist)

    def get_keywords(self, txt):
        """Segment, tag and NER ``txt``; return all accumulated entity words.

        Uses this instance's own models and entity dictionary.  Because
        ``ner`` accumulates, the result also contains words collected by
        earlier calls since the last ``clean_ner``.
        """
        words = self.segment(txt)
        postags = self.postag(words)
        ners = self.ner(words, postags)
        return list(ners)

    def add_entity(self, word, tag):
        """Record one more occurrence of ``word``, storing the latest ``tag``."""
        count = self.nerdict[word][1] if word in self.nerdict else 0
        self.nerdict[word] = [tag, count + 1]

    # LTP entity types: person (Nh), place (Ns), organisation (Ni).
    # LTP uses the BIESO scheme: B = first word of an entity, I = inner word,
    # E = last word, S = single-word entity, O = not part of an entity.
    def ner(self, wordlist, postags):
        """Run NER and add qualifying words to the entity dictionary.

        A word is added when its POS tag is in ``NOUN_LIST`` and it is longer
        than one character, and added again when it is a single-word entity
        (``S-Ns``/``S-Nh``/``S-Ni``) that is longer than one character or is
        listed in ``PROVINCE_NAME``.  The stored tag is the POS tag.

        :return: ``self.nerdict`` (the accumulated dictionary itself)
        """
        ners = self.recognizer.recognize(wordlist, postags)
        for i, ner_tag in enumerate(ners):
            if postags[i] in NOUN_LIST:
                word = wordlist[i].strip()
                if len(word) > 1:
                    self.add_entity(word, postags[i])
            if ner_tag in ('S-Ns', 'S-Nh', 'S-Ni'):
                word = wordlist[i].strip()
                if len(word) > 1 or word in PROVINCE_NAME:
                    self.add_entity(word, postags[i])
        return self.nerdict

    def clean_ner(self):
        """Forget every accumulated entity."""
        self.nerdict = dict()

    def get_ner(self):
        """Return the accumulated ``{word: [tag, count]}`` dictionary."""
        return self.nerdict
from similarity import *

# NOTE(review): presumably counters/flags for subject, condition and object
# patterns -- their use is not visible in this part of the file; confirm.
sub_pattern, con_pattern, obj_pattern = 0, 0, 0

# Paths of the individual LTP model files under LTP_DATA_DIR.
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
parser_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
pisrl_model_path = os.path.join(LTP_DATA_DIR, 'pisrl.model')

# Model loaded through svm_load_model from the project's data directory.
role_model = svm_load_model("../data/models/role.model")

# All LTP models below are loaded once, at import time.
# Word segmenter with an external lexicon.
segmentor = Segmentor()
segmentor.load_with_lexicon(cws_model_path, '../data/configure/lexicon.txt')
# POS tagger with an external lexicon.
postagger = Postagger()
postagger.load_with_lexicon(pos_model_path, '../data/configure/pos.txt')
# Dependency parser.
parser = Parser()
parser.load(parser_model_path)
# Named-entity recognizer.
recognizer = NamedEntityRecognizer()
recognizer.load(ner_model_path)
# Semantic role labeller.
labeller = SementicRoleLabeller()
labeller.load(pisrl_model_path)


class Record():
    # Record object; only the attribute below is visible in this excerpt.
    def __init__(self):
        # Starts as an empty string.
        self.original_sentence = ''
class LtpParser():
    """Segment and POS-tag sentences, merging multi-word named entities."""

    def __init__(self):
        """Load the segmenter, POS tagger (both with lexicons) and NER model."""
        model_dir = "/home/ubuntu/model/ltp/ltp_data_v3.4.0"

        def in_model_dir(name):
            return os.path.join(model_dir, name)

        # Segmenter with an external word dictionary.
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(in_model_dir("cws.model"),
                                         in_model_dir("word_dict.txt"))
        # POS tagger with an external dictionary.
        self.postagger = Postagger()
        self.postagger.load_with_lexicon(in_model_dir("pos.model"),
                                         in_model_dir("n_word_dict.txt"))
        # Named-entity recognizer.
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(in_model_dir("ner.model"))

    def wordspostags(self, name_entity_dist, words, postags):
        """Replace the pieces of each recognised entity by one merged token.

        The sentence is rendered as ``word/tag`` tokens joined by spaces, the
        ``consist`` run of every entity is substituted by its ``name``, and
        the text is split back into parallel word and tag lists.  Tokens that
        do not split into exactly two ``/`` parts, or whose word part is
        empty, are dropped.

        :return: ``(words, postags)`` after merging
        """
        tagged_text = ' '.join(w + '/' + p for w, p in zip(words, postags))
        for infos in name_entity_dist.values():
            for info in infos or ():
                tagged_text = tagged_text.replace(' '.join(info['consist']),
                                                  info['name'])
        merged_words, merged_tags = [], []
        for token in tagged_text.split(' '):
            parts = token.split('/')
            if len(parts) == 2 and parts[0]:
                merged_words.append(parts[0])
                merged_tags.append(parts[1])
        return merged_words, merged_tags

    def entity(self, words, netags, postags):
        """Group the NER output into person / organisation / place entities.

        :param words: the words
        :param netags: entity tag of each word (BIESO scheme)
        :param postags: POS tag of each word
        :return: dict with keys ``'nhs'``, ``'nis'`` and ``'nss'``, each a
            list of entity descriptions built by ``modify``
        """
        # Completed entities and the entity currently being assembled, per
        # type.  Each piece is encoded as "<word>_<index> ".
        finished = {'Nh': [], 'Ni': [], 'Ns': []}
        pending = {'Nh': '', 'Ni': '', 'Ns': ''}
        for index, (word, ntag) in enumerate(zip(words, netags)):
            position = ntag[0]
            if position == "O":
                continue
            suffix = ntag[-2:]
            # Anything that is neither a person nor an organisation is
            # treated as a place.
            kind = suffix if suffix in ("Nh", "Ni") else "Ns"
            piece = word + '_%s ' % index
            if position == "S":
                finished[kind].append(piece)
            elif position in ("B", "I"):
                pending[kind] = pending[kind] + piece
            else:
                # Closing word: flush the assembled entity.
                finished[kind].append(pending[kind] + piece)
                pending[kind] = ""

        name_entity_dict = {}
        name_entity_dict['nhs'] = self.modify(finished['Nh'], words,
                                              postags, 'nh')
        name_entity_dict['nis'] = self.modify(finished['Ni'], words,
                                              postags, 'ni')
        name_entity_dict['nss'] = self.modify(finished['Ns'], words,
                                              postags, 'ns')
        return name_entity_dict

    def modify(self, entity_list, words, postags, tag):
        """Turn encoded entities into description dictionaries.

        Each result has ``stat_index`` / ``end_index`` (word positions, as
        strings), ``consist`` (the ``word/tag`` tokens covered) and ``name``
        (the concatenated words followed by ``/`` and ``tag``).
        """
        described = []
        for entity in entity_list or ():
            pieces = entity.split(' ')[:-1]
            first = pieces[0].split('_')[1]
            last = pieces[-1].split('_')[1]
            covered = [
                words[i] + '/' + postags[i]
                for i in range(int(first), int(last) + 1)
            ]
            merged_name = ''.join(p.split('_')[0] for p in pieces) + '/' + tag
            entity_dict = {}
            entity_dict['stat_index'] = first
            entity_dict['end_index'] = last
            entity_dict['consist'] = covered
            entity_dict['name'] = merged_name
            described.append(entity_dict)
        return described

    def post_ner(self, words):
        """Return ``(postags, netags)`` for ``words``."""
        tags = list(self.postagger.postag(words))
        return tags, self.recognizer.recognize(words, tags)

    def parser_process(self, sentence):
        """Segment ``sentence`` and return ``(words, postags)`` with entities merged."""
        tokens = list(self.segmentor.segment(sentence))
        tags, entity_tags = self.post_ner(tokens)
        entities = self.entity(tokens, entity_tags, tags)
        return self.wordspostags(entities, tokens, tags)
from pyltp import Parser
from pyltp import Segmentor
from pyltp import Postagger
import networkx as nx
import pylab
import re
import matplotlib.pyplot as plt
from pylab import mpl
from graphviz import Digraph
import numpy as np

# POS tagger with the user dictionary.  The first argument of
# load_with_lexicon is the model path, the second the lexicon path.
postagger = Postagger()
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
postagger.load_with_lexicon(pos_model_path, '../data/user_dict.txt')

# Word segmenter with the same user dictionary.
segmentor = Segmentor()
segmentor.load_with_lexicon(cws_model_path, '../data/user_dict.txt')

# Dependency relation labels.
SEN_TAGS = [
    "SBV", "VOB", "IOB", "FOB", "DBL", "ATT", "ADV", "CMP", "COO", "POB",
    "LAD", "RAD", "IS", "HED"
]


def parse(s, isGraph=False):
    """
    Run syntactic analysis on a sentence and return the parse result.
    """
    tmp_ner_dict = {}
    num_lst = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']
class pyltp_worker(object):
    """Convenience wrapper that owns one instance of each pyltp model."""

    def __init__(self, model_path):
        """Create the model instances and load the base models.

        :param model_path: directory containing the LTP ``*.model`` files
        """
        self.LTP_MODEL_DIR = model_path
        self.segmentor = Segmentor()  # word segmentation
        self.postagger = Postagger()  # POS tagging
        self.recognizer = NamedEntityRecognizer()  # named-entity recognition
        self.parser = Parser()  # dependency parsing
        self.load_model()

    def load_model(self):
        """Compute the model paths and load the four base models."""
        self.cws_model_path = os.path.join(self.LTP_MODEL_DIR, 'cws.model')
        self.pos_model_path = os.path.join(self.LTP_MODEL_DIR, 'pos.model')
        self.ner_model_path = os.path.join(self.LTP_MODEL_DIR, 'ner.model')
        self.par_model_path = os.path.join(self.LTP_MODEL_DIR, 'parser.model')
        self.segmentor.load(self.cws_model_path)
        self.postagger.load(self.pos_model_path)
        self.recognizer.load(self.ner_model_path)
        self.parser.load(self.par_model_path)

    def end(self):
        """Release all four models."""
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()

    # Custom dictionaries.  load_with_lexicon takes the model path first and
    # the lexicon path second, so the model is reloaded together with the
    # lexicon.
    def add_cws_userdict(self, lexicon_path):
        """Reload the segmenter with an external lexicon."""
        self.segmentor.load_with_lexicon(self.cws_model_path, lexicon_path)

    def add_pos_userdict(self, lexicon_path):
        """Reload the POS tagger with an external lexicon."""
        self.postagger.load_with_lexicon(self.pos_model_path, lexicon_path)

    def add_ner_userdict(self, lexicon_path):
        """Reload the entity recognizer with an external lexicon.

        NOTE(review): the pyltp documentation describes ``load_with_lexicon``
        only for the segmenter and the POS tagger -- confirm that the
        installed ``NamedEntityRecognizer`` provides it.
        """
        self.recognizer.load_with_lexicon(self.ner_model_path, lexicon_path)

    def add_par_userdict(self, lexicon_path):
        """Reload the dependency parser with an external lexicon.

        NOTE(review): same caveat as ``add_ner_userdict`` -- confirm that the
        installed ``Parser`` provides ``load_with_lexicon``.
        """
        self.parser.load_with_lexicon(self.par_model_path, lexicon_path)

    def sentsplit(self, text):
        """Split ``text`` into sentences and return them as a list."""
        # Imported locally so the method does not depend on a module-level
        # name; the pyltp class is spelled ``SentenceSplitter``.
        from pyltp import SentenceSplitter
        return list(SentenceSplitter.split(text))

    def cws(self, text):
        """Segment ``text`` and return the list of words."""
        return list(self.segmentor.segment(text))

    def pos(self, words):
        """Return the list of POS tags for ``words``."""
        return list(self.postagger.postag(words))

    def ner(self, words, postags):
        """Return the list of named-entity tags for ``words``."""
        return list(self.recognizer.recognize(words, postags))

    def par(self, words, postags):
        """Run dependency parsing.

        :return: ``(pr_list, df_list, arcs)`` where ``pr_list`` holds one
            ``[word, word_pos, source, source_pos, relation]`` row per word,
            ``df_list`` holds the same data as five column lists, and
            ``arcs`` is the raw parser output
        """
        arcs = self.parser.parse(words, postags)
        pr_list = []
        for i, arc in enumerate(arcs):
            # NOTE(review): arc.head is 1-based; for head == 0 the index
            # becomes -1 and therefore selects the LAST word as "source".
            # Kept as-is because callers may rely on it -- confirm intent.
            head = arc.head - 1
            pr_list.append(
                [words[i], postags[i], words[head], postags[head],
                 arc.relation])
        # Column-wise view: words, POS tags, sources, source POS, relations.
        df_list = [[row[col] for row in pr_list] for col in range(5)]
        return pr_list, df_list, arcs
corpus_test = X_test_sentence['ner'].map(filtered_segment).tolist() # 提取tfidf特征 vectorizer = TfidfVectorizer() X_train = vectorizer.fit_transform(corpus_train) X_test = vectorizer.transform(corpus_test) print(X_train.shape) print(X_test.shape) # 提取句法特征 # 1、企业实体间距离 # 2、企业实体间句法距离 # 3、企业实体分别和关键触发词的距离 # 4、实体的依存关系类别 postagger = Postagger() postagger.load_with_lexicon('../data/ltp-models/ltp_data_v3.4.0/pos.model', '../data/user_dict.txt') # 加载模型 segmentor = Segmentor() segmentor.load_with_lexicon('../data/ltp-models/ltp_data_v3.4.0/cws.model', '../data/user_dict.txt') # 加载模型 def shortest_path(arcs_ret, source, target): """ 求出两个词最短依存句法路径,不存在路径返回-1 arcs_ret:句法分析结果表格 source:实体1 target:实体2 """ G = nx.DiGraph() # 为这个网络添加节点... for i in list(arcs_ret.index):
class ExtraModel:
    """Extract main clauses and subject names from text with LTP and jieba."""

    def __init__(self):
        """Create the POS tagger (unloaded) and load the dependency parser.

        The POS tagger model is loaded later by ``loadNrTable`` together with
        the name lexicon of a book.
        """
        self.nr_table_name = ''
        self.nrTable = None
        # POS tagging
        self.postagger = Postagger()
        # Dependency parsing
        par_model_path = os.path.join(os.path.dirname(__file__),
                                      '../data/ltp_data/parser.model')
        self.parser = Parser()
        self.parser.load(par_model_path)

    def getNrTable(self, book):
        """Read the name list of ``book``.

        The file ``NR_TABLE_PATH_BASE + book + '.nps.txt'`` is read, its first
        line is skipped and the first comma-separated field of every other
        line is returned.
        """
        with open(NR_TABLE_PATH_BASE + book + '.nps.txt') as f:
            data = f.read().splitlines()[1:]
        return [it.split(',')[0] for it in data]

    def loadNrTable(self, book, nrTable=None):
        """Load the POS tagger with the name lexicon of ``book``.

        :param book: book identifier used to build the lexicon file name
        :param nrTable: optional list of names; read via ``getNrTable`` when
            omitted

        The lexicon file is created on first use, at the same path it is then
        loaded from (one ``"<name> nh"`` line per name).  Every name is also
        registered with jieba.
        """
        if nrTable is None:
            nrTable = self.getNrTable(book)
        path = 'data/npList/' + book + '.nps.table'
        if not os.path.exists(path):
            with open(path, 'w') as out:
                for nr in nrTable:
                    out.write(nr + ' nh\n')
        self.postagger.load_with_lexicon('data/ltp_data/pos.model', path)
        for nr in nrTable:
            jieba.add_word(nr)
        self.nrTable = nrTable
        self.nr_table_name = book

    def InputWords(self, words):
        """POS-tag and parse ``words``.

        :return: ``(relation, heads, root, postags)`` where ``heads`` holds
            the 0-based index of each word's head (``-1`` for the root) and
            ``root`` is the index of the first root word; ``None`` when the
            parse has no root
        """
        postags = self.postagger.postag(words)
        arcs = self.parser.parse(words, postags)
        rely_id = [arc.head for arc in arcs]  # 1-based head ids, 0 = root
        relation = [arc.relation for arc in arcs]  # dependency relations
        print(words)
        print(list(postags))
        heads = [-1 if head == 0 else head - 1 for head in rely_id]
        for i, v in enumerate(heads):
            if v == -1:
                return relation, heads, i, postags
        return None

    def addCooNode(self, s, word, sbvlink):
        """Append a coordinated verb ``word`` at the end of the ``coo`` chain of ``s``."""
        if s.coo:
            self.addCooNode(s.coo, word, sbvlink)
        else:
            verb = Sentence(word)
            if word in sbvlink:
                verb.sbv = sbvlink[word]
            s.coo = verb
        return None

    def addVob(self, s, head, word):
        """Set ``word`` as object of the node in the ``coo`` chain whose verb is ``head``."""
        if s is not None:
            if s.v == head:
                s.vob = word
            else:
                self.addVob(s.coo, head, word)
        return None

    def getMainSentence(self, s):
        """Return the main clauses of ``s`` as a list of strings.

        NOTE(review): the logic below is reproduced unchanged.  ``stack`` is
        seeded with a word (``words[root]``) but its items are then used as
        list indices (``words[hed]``) and compared with integer head indices,
        which looks inconsistent -- confirm the intended index/word handling
        before relying on this method.
        """
        s = self.DialogFliter(s)
        words = list(jieba.cut(s))
        if not words:
            return []
        relation, heads, root, postags = self.InputWords(words)
        stack = [words[root]]
        res = {}
        cooLink = {}
        sbvLink = {}
        n = len(words)
        while stack:
            hed = stack.pop(0)
            res[words[hed]] = Sentence(words[hed])
            for i in range(n):
                r = relation[i]
                if r == 'COO':
                    h = heads[i]
                    w = words[i]
                    if h in cooLink:
                        h = cooLink[h]
                    if h == hed and h != w:
                        self.addCooNode(res[words[h]], w, sbvLink)
                        cooLink[w] = h
                if r == 'VOB':
                    h = heads[i]
                    if h in cooLink:
                        h = cooLink[h]
                    if h == hed:
                        w = words[i]
                        self.addVob(res[words[h]], words[heads[i]], w)
                        if postags[i] == 'v' and (w not in res):
                            stack.append(w)
                if r == 'SBV':
                    h = heads[i]
                    if h in cooLink:
                        h = cooLink[h]
                    if h == hed:
                        verb = res[words[h]]
                        while verb.v != heads[i]:
                            verb = verb.coo
                        if verb.sbv:
                            verb.sbv += words[i]
                        else:
                            verb.sbv = words[i]
                    else:
                        sbvLink[heads[i]] = words[i]
        text = []
        for it in res:
            text.append(self.getSentence(res[it]).replace('\n', ''))
        return text

    def DialogFliter(self, s):
        """Strip quoted passages and keep the text between ``::`` markers.

        Quoted spans are removed first.  The result is the text after the
        first ``::`` and, when a second ``::`` exists, before it.  Text
        without any ``::`` marker is returned with only the quotes removed.
        """
        res = re.sub('[\'\"‘“].*?[’”\'\"]', '', s)
        content = [it.span() for it in re.finditer('::', res)]
        if not content:
            return res
        if len(content) < 2:
            return res[content[0][1]:]
        return res[content[0][1]:content[1][0]]

    def readCoo(self, s):
        """Render the ``coo`` chain starting at ``s`` as a list of clause strings.

        The list always ends with an empty string produced by the terminating
        empty node.
        """
        text = ''
        if s:
            if s.sbv:
                text = s.sbv + text
            text = text + s.v
            if s.vob:
                text = text + s.vob
            return [text] + self.readCoo(s.coo)
        return [text]

    def getSentence(self, sentence):
        """Render ``sentence`` (verb, object, coordinated clauses) as text.

        The subject is prefixed when known, otherwise ``'[unknown]'``.
        """
        if sentence.vob:
            text = [sentence.v + sentence.vob]
        else:
            text = [sentence.v]
        if sentence.coo:
            text = text + self.readCoo(sentence.coo)
        text = ','.join(text)
        if sentence.sbv:
            return sentence.sbv + text
        return '[unknown]' + text

    def getMain(self, text, book):
        """Collect subject-like words attached to the root and its coordinates.

        Starting from the root, every ``SBV`` dependant, and every ``ATT``
        dependant listed in ``self.nrTable``, is recorded; ``COO`` dependants
        are visited in turn.

        :return: list of ``(word, offset_in_text, postag, 1)`` tuples
        """
        nr = []
        words = list(jieba.cut(text))
        relation, heads, root, postags = self.InputWords(words)
        queue = [root]
        N = len(words)
        while queue:
            current = queue.pop(0)
            for i in range(N):
                if heads[i] != current:
                    continue
                if relation[i] == 'SBV' or (relation[i] == 'ATT'
                                            and words[i] in self.nrTable):
                    # Plain substring search: the word must not be
                    # interpreted as a regular expression.
                    nr.append((words[i], text.find(words[i]), postags[i], 1))
                elif relation[i] == 'COO':
                    queue.append(i)
        return nr
class LTP:
    """Word segmentation, POS tagging and named-entity recognition via pyltp."""

    def __init__(
            self,
            ltp_data_path=None,
            seg_lexicon=None,
            pos_lexicon=None,
    ):
        """Load the segmenter, POS tagger and entity recognizer.

        :param ltp_data_path: directory containing the LTP model files
        :param seg_lexicon: optional external lexicon for the segmenter
        :param pos_lexicon: optional external lexicon for the POS tagger
        :raises ValueError: when ``ltp_data_path`` is empty
        """
        if not ltp_data_path:
            raise ValueError('请指定ltp用到的模型所在路径!!!')
        self.ltp_data_path = ltp_data_path
        # Model files: `cws.model` (segmentation), `pos.model` (POS tagging)
        # and `ner.model` (named-entity recognition).
        self._cws_model_path = os.path.join(self.ltp_data_path, 'cws.model')
        self._pos_model_path = os.path.join(self.ltp_data_path, 'pos.model')
        self._ner_model_path = os.path.join(self.ltp_data_path, 'ner.model')

        self._segmentor = Segmentor()
        if seg_lexicon:
            self._segmentor.load_with_lexicon(self._cws_model_path,
                                              seg_lexicon)
        else:
            self._segmentor.load(self._cws_model_path)

        self._postagger = Postagger()
        if pos_lexicon:
            self._postagger.load_with_lexicon(self._pos_model_path,
                                              pos_lexicon)
        else:
            self._postagger.load(self._pos_model_path)

        self._recognizer = NamedEntityRecognizer()
        self._recognizer.load(self._ner_model_path)

    def cut(self, text):
        """Segment ``text`` into words."""
        return self._segmentor.segment(text)

    def pos(self, text):
        """Return an iterable of ``(word, postag)`` pairs for ``text``."""
        words = self.cut(text)
        postags = self._postagger.postag(words)
        return zip(words, postags)

    def ner(self, text):
        """Recognise named entities in ``text``.

        Three entity kinds are reported: PER (person), LOC (place) and
        ORG (organisation).

        :param text: text to analyse
        :return: dict with keys ``'PER'``, ``'ORG'`` and ``'LOC'``, each a
            list of entity strings
        """
        # Nh = person name, Ni = organisation name, Ns = place name.
        ner_dict = {'Nh': [], 'Ni': [], 'Ns': []}
        words = self.cut(text)
        postags = self._postagger.postag(words)
        nertags = self._recognizer.recognize(words, postags)

        ner_tmp = []  # words of the multi-word entity being assembled
        pending_type = None  # entity type of the words held in ner_tmp
        for i, tag in enumerate(nertags):
            if tag == 'O':
                continue
            entity_type = tag.split('-')[-1]
            if tag.startswith('S'):
                ner_dict[entity_type].append(words[i])
            elif tag.startswith(('B', 'I')):
                ner_tmp.append(words[i])
                pending_type = entity_type
            elif tag.startswith('E'):
                ner_tmp.append(words[i])
                ner_dict[entity_type].append(''.join(ner_tmp))
                ner_tmp = []
                pending_type = None
        if ner_tmp:
            # An entity that never received its closing tag is filed under
            # the type of its own last word, not of the sentence's last tag.
            ner_dict[pending_type].append(''.join(ner_tmp))

        ner_map = dict()
        ner_map['PER'] = ner_dict['Nh']
        ner_map['ORG'] = ner_dict['Ni']
        ner_map['LOC'] = ner_dict['Ns']
        return ner_map

    def release(self):
        """Release the three loaded models."""
        self._segmentor.release()
        self._recognizer.release()
        self._postagger.release()
#coding:utf-8 from pyltp import Postagger from pyltp import Parser import sys import os import jieba import chardet reload(sys) sys.setdefaultencoding('utf-8') dir_path = os.path.dirname(os.path.abspath(__file__)) LTP_DATA_DIR = '/home/wangwei/hotword/hotword/conf/ltp_data' jieba.load_userdict("/home/wangwei/hotword/hotword/conf/jieba_lexicon") postagger = Postagger() # 初始化实例 pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model') postagger.load_with_lexicon(pos_model_path, '/home/wangwei/model/posttags.txt') # tmplist =[] # with open(sys.argv[2],'rb') as f: # for line in f: # if line: # tmplist.append(line.strip()) def cut_word(sents): """ 分词 """ words = [i.encode('utf-8', 'ignore') for i in jieba.cut(sents, HMM=False)] # HMM=False # print sents, '\t'.join(words) return words
class FindAttribute(object):
    """Collect candidate attribute words from sentences based on POS tags.

    Python 2 code: words are handled as UTF-8 byte strings and decoded only
    to measure their length in characters.
    """

    def __init__(self, moniter_word, Dsent, industry_id):
        """Store the inputs, load the POS tagger and fetch the known words.

        :param moniter_word: stored on the instance; not used elsewhere in
            this class
        :param Dsent: mapping whose values are UTF-8 encoded sentence strings
            (it is iterated with ``.items()`` and decoded in ``prepare``)
        :param industry_id: passed to ``self.sql.run`` to fetch the known
            attributes, the synonym mapping and ``dup_word``
        """
        self.moniter_word = moniter_word
        self.industry_id = industry_id
        self.Dsent = Dsent
        self.postagger = Postagger()  # create the tagger instance
        self.postagger.load_with_lexicon(pos_model_path,
                                         '%s/conf/posttags.txt' % dir_path)
        # self.parser = Parser()  # create the parser instance
        # self.parser.load(par_model_path)  # load the model
        self.sql = mysqls()
        self.carattributes, self.sysn, self.dup_word = self.sql.run(
            industry_id)
        # Accumulates every candidate word found by word_sex().
        self.n_v = []

    def cut_word(self, sents):  # word segmentation
        """Segment ``sents`` and replace words that have a synonym entry.

        :return: list of UTF-8 encoded words; a word found in ``self.sysn``
            is replaced by its mapped value
        """
        words = [i.encode('utf-8', 'ignore') for i in norm_cut(sents)]  # HMM=False
        num = 0
        # Handle synonyms.
        for w in words:
            if w in self.sysn.keys():
                words[num] = self.sysn[w]
            num += 1
        return words

    def word_sex(self, ):  # get the POS tags
        """Tag ``self.words`` and append candidate words to ``self.n_v``.

        :return: the list of POS tags for ``self.words``
        """
        postags = list(self.postagger.postag(self.words))  # POS tagging
        num = 0  # index of the current tag
        # The word right before an adjective ('a') or adverb ('d'), if that
        # word is a noun or a verb.
        for tag in postags:
            if tag in ['a', 'd']:
                if num != 0 and postags[num - 1] in ['n', 'v']:
                    # Keep only unknown words longer than one character.
                    if self.words[num - 1] not in self.carattributes \
                            and len(self.words[num - 1].decode('utf-8','ignore')) > 1:
                        self.n_v.append(self.words[num - 1])
            # A noun or verb in first position of the clause.
            if tag in ['n', 'v'] and num == 0:
                if self.words[num] not in self.carattributes\
                        and len(self.words[num].decode('utf-8','ignore')) > 1:
                    # self.words[num] not in self.dup_word \
                    self.n_v.append(self.words[num])
            num += 1
        # print 'POS', '\t'.join(postags)
        return postags

    def prepare(self, ):
        """Process every sentence and return the most frequent candidates.

        The tagger is released at the end, so the instance cannot tag again
        afterwards.

        :return: up to 500 ``(word, count)`` pairs, highest count first
        """
        for id, sentences in self.Dsent.items():
            # Split each text into clauses on punctuation / whitespace.
            split_sentence = re.split(
                ur'[,,()()、: …~?。!. !?]?',
                sentences.decode('utf-8', 'ignore').strip())
            for sent in split_sentence:
                self.words = self.cut_word(sent.encode('utf-8', 'ignore'))
                self.postags = self.word_sex()
        # self.segmentor.release()  # release the model
        # outfile = open('attribute_dup.txt', 'a')
        # for word in set(self.n_v):
        cword = Counter(self.n_v)
        lresult = heapq.nlargest(500, cword.items(), key=lambda x: x[1])
        # The plain word list is built here but only used by the disabled
        # insert call below.
        lword = []
        for rg in lresult:
            w, n = rg
            lword.append(w)
        # self.sql.insert(self.industry_id, lword)
        self.postagger.release()  # release the model
        # self.parser.release()  # release the model
        # outfile.close()
        return lresult
class LtpParser():
    """Wrapper around the pyltp models with NER post-processing helpers."""

    # Model directory used when the caller does not supply one.
    DEFAULT_LTP_DIR = "E:\\study\\Projects\\data-mining\\ltp\\ltp_data_v3.4.0"

    def __init__(self, ltp_dir=None):
        """Load the LTP models and the stop-word list.

        :param ltp_dir: directory holding ``pos.model``, ``parser.model``,
            ``ner.model``, ``pisrl_win.model``, ``n_word_dict`` and
            ``stopwords.txt``; defaults to ``DEFAULT_LTP_DIR``.
        """
        LTP_DIR = self.DEFAULT_LTP_DIR if ltp_dir is None else ltp_dir
        # The segmentor is created but no model is loaded into it; the
        # sentence-level methods below tokenize with jieba instead.
        self.segmentor = Segmentor()
        # self.segmentor.load_with_lexicon(os.path.join(LTP_DIR, "cws.model"), os.path.join(LTP_DIR, "word_dict"))
        self.postagger = Postagger()
        self.postagger.load_with_lexicon(os.path.join(LTP_DIR, "pos.model"),
                                         os.path.join(LTP_DIR, "n_word_dict"))
        self.parser = Parser()  # dependency parsing
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))
        self.recognizer = NamedEntityRecognizer()  # named entity recognition
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
        self.labeller = SementicRoleLabeller()  # semantic role labelling
        self.labeller.load(os.path.join(LTP_DIR, "pisrl_win.model"))
        # Stop words, one per line, GBK encoded.
        with open(os.path.join(LTP_DIR, 'stopwords.txt'), 'r',
                  encoding='gbk') as fread:
            self.stopwords = {line.strip() for line in fread}

    def wordspostags(self, name_entity_dist, words, postags):
        """Merge multi-word entities into single tokens.

        The sentence is rendered as ``word/tag`` items joined by spaces, each
        entity's ``consist`` sequence is replaced by its ``name`` item, and
        the result is split back into parallel word and tag lists.

        :param name_entity_dist: dict as returned by :meth:`entity`.
        :return: ``(words, postags)`` after merging.
        """
        post = ' '.join(word + '/' + tag for word, tag in zip(words, postags))
        for infos in name_entity_dist.values():
            for info in infos or ():
                post = post.replace(' '.join(info['consist']), info['name'])
        items = [item for item in post.split(' ')
                 if len(item.split('/')) == 2 and item.split('/')[0]]
        merged_words = [item.split('/')[0] for item in items]
        merged_postags = [item.split('/')[1] for item in items]
        return merged_words, merged_postags

    def entity(self, words, netags, postags):
        """Group NER tags into person / organization / place entities.

        :param words: tokens of the sentence.
        :param netags: one NER tag per token; the first character is the
            position marker (O, S, B, I, anything else closes an entity) and
            the last two characters are the type (Nh, Ni, otherwise place).
        :param postags: one POS tag per token.
        :return: dict with keys ``'nhs'``, ``'nis'`` and ``'nss'``, each a
            list of entity dicts built by :meth:`modify`.
        """
        finished = {'Nh': [], 'Ni': [], 'Ns': []}
        pending = {'Nh': "", 'Ni': "", 'Ns': ""}
        for index, (word, ntag) in enumerate(zip(words, netags)):
            position = ntag[0]
            if position == "O":
                continue
            suffix = ntag[-2:]
            kind = suffix if suffix in ("Nh", "Ni") else "Ns"
            piece = word + '_%s ' % index
            if position == "S":
                finished[kind].append(piece)
            elif position in ("B", "I"):
                pending[kind] += piece
            else:
                # Closing tag: flush the accumulated multi-word entity.
                finished[kind].append(pending[kind] + piece)
                pending[kind] = ""
        name_entity_dict = {}
        name_entity_dict['nhs'] = self.modify(finished['Nh'], words, postags, 'nh')
        name_entity_dict['nis'] = self.modify(finished['Ni'], words, postags, 'ni')
        name_entity_dict['nss'] = self.modify(finished['Ns'], words, postags, 'ns')
        return name_entity_dict

    def modify(self, entity_list, words, postags, tag):
        """Turn ``"word_index "`` encoded entities into descriptive dicts.

        :param entity_list: strings such as ``"张_0 三_1 "`` produced by
            :meth:`entity`.
        :param tag: entity tag appended to the merged name.
        :return: list of dicts with keys ``'stat_index'``, ``'end_index'``
            (both strings), ``'consist'`` and ``'name'``.
        """
        result = []
        for entity in entity_list or ():
            subs = entity.split(' ')[:-1]
            # Split on the LAST underscore so that words which themselves
            # contain '_' keep their text and still yield a numeric index.
            parts = [sub.rsplit('_', 1) for sub in subs]
            start_index = parts[0][1]
            end_index = parts[-1][1]
            consist = [words[index] + '/' + postags[index]
                       for index in range(int(start_index), int(end_index) + 1)]
            result.append({
                'stat_index': start_index,
                'end_index': end_index,
                'consist': consist,
                'name': ''.join(part[0] for part in parts) + '/' + tag,
            })
        return result

    def post_ner(self, words):
        """Return ``(postags, netags)`` for the given tokens."""
        postags = list(self.postagger.postag(words))
        nerags = self.recognizer.recognize(words, postags)
        return postags, nerags

    def parser_process(self, sentence):
        """Tokenize with jieba, tag, and merge recognized entities.

        :return: ``(words, postags)`` with multi-word entities merged.
        """
        words = list(jieba.cut(sentence))
        post, ner = self.post_ner(words)
        name_entity_dist = self.entity(words, ner, post)
        words, postags = self.wordspostags(name_entity_dist, words, post)
        return words, postags

    def analysis(self, sentence):
        """Tokenize with jieba and dependency-parse the sentence.

        :return: ``(words, arcs)``.
        """
        words = list(jieba.cut(sentence))
        # Only POS tags are needed for parsing; NER is skipped here.
        post = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, post)
        return words, arcs

    def getWord(self, Type, arcs, words):
        """Return the first word whose arc relation equals ``Type``, else None."""
        for word, arc in zip(words, arcs):
            if arc.relation == Type:
                return word
        return None

    def getFirst(self, List):
        """Return the first element of ``List`` that is not None, else None."""
        for item in List:
            if item is not None:
                return item
        return None

    def getMain(self, sentence):
        """Render subject, predicate, object and adverbial of ``sentence``.

        Missing parts are rendered as empty strings.
        """
        words, arcs = self.analysis(sentence)
        hed = self.getWord("HED", arcs, words)
        sbv = self.getWord("SBV", arcs, words)
        vob = self.getWord("VOB", arcs, words)
        fob = self.getWord("FOB", arcs, words)
        adv = self.getWord("ADV", arcs, words)
        pob = self.getWord("POB", arcs, words)
        zhu = self.getFirst([sbv, pob])
        wei = hed
        obj = self.getFirst([vob, fob, pob])
        # Blank out missing parts before formatting, so that a real word that
        # happens to contain the text "None" is not stripped.
        parts = ['' if part is None else part for part in (zhu, wei, obj, adv)]
        return '{}{}{},(副词:{})'.format(*parts)

    def release_model(self):
        """Release every pyltp model held by this instance."""
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
class LTPParser(Parser):
    """Parser implementation backed by LTP.

    Original author's note: LTP's support for user-defined dictionaries is
    limited (see http://www.ltp-cloud.com/support/):
    1. after extending the custom dictionary LTP has to be recompiled;
    2. segmentation supports custom dictionaries, POS tagging does not.
    """

    # Word/tag used for the synthetic head of the sentence root.
    _ROOT_WORD = 'Root'

    def __init__(self, ltp_model_dir, custom_seg_file=None, custom_pos_file=None):
        """Load the segmentation, POS, NER and dependency models.

        :param ltp_model_dir: directory containing ``cws.model``,
            ``pos.model``, ``ner.model`` and ``parser.model``.
        :param custom_seg_file: optional lexicon for the segmentor.
        :param custom_pos_file: optional lexicon for the POS tagger.
        """
        super(LTPParser, self).__init__()
        self._ltp_dir = ltp_model_dir

        # Word segmentation model.
        seg_model_file = os.path.join(self._ltp_dir, 'cws.model')
        self._segmentor = Segmentor()
        if custom_seg_file:
            self._segmentor.load_with_lexicon(seg_model_file, custom_seg_file)
        else:
            self._segmentor.load(seg_model_file)

        # POS tagging model.
        self._tagger = Postagger()
        pos_model_file = os.path.join(self._ltp_dir, 'pos.model')
        if custom_pos_file:
            self._tagger.load_with_lexicon(pos_model_file, custom_pos_file)
        else:
            self._tagger.load(pos_model_file)

        # Named entity recognition model.
        self._ner = NamedEntityRecognizer()
        self._ner.load(os.path.join(self._ltp_dir, 'ner.model'))

        # Dependency parsing model.
        self._parser = LParser()
        self._parser.load(os.path.join(self._ltp_dir, 'parser.model'))

    def segment(self, txt):
        """Return the list of tokens of ``txt``."""
        return list(self._segmentor.segment(txt))

    def pos(self, txt, cache=False, revise=False):
        """Segment and POS-tag ``txt``.

        :param cache: when true, try ``self._get_from_cache`` first.
        :param revise: accepted so that ``parse2relations`` (which passes
            ``revise=True``) no longer fails with a TypeError; no revision
            step is applied here.
            NOTE(review): confirm against the base class what ``revise``
            is expected to do.
        :return: list of ``Token(word, tag, index)``.
        """
        result = None
        if cache:
            result = self._get_from_cache(txt)
        if result is None:
            tokenized = self.segment(txt)
            tags = self._tagger.postag(tokenized)
            result = [Token(word, tag, index)
                      for index, (word, tag) in enumerate(zip(tokenized, tags))]
            self._set_cache(txt, result)
        return result

    def ner(self, txt):
        """Return the NER tag of every token of ``txt``."""
        tokens = self.pos(txt)
        return list(
            self._ner.recognize([t.word for t in tokens],
                                [t.pos for t in tokens]))

    def parse2relations(self, txt):
        """Dependency-parse ``txt`` into a list of ``Relation`` objects.

        Each relation links a head token to a dependent token.  The head of
        the sentence root is a synthetic token whose word and tag are
        ``'Root'`` and whose index is ``-1``.
        """
        tokens = self.pos(txt, revise=True)
        words = [t.word for t in tokens]
        tags = [t.pos for t in tokens]
        arcs = self._parser.parse(words, tags)
        result = []
        for index, (word, tag, arc) in enumerate(zip(words, tags, arcs)):
            if arc.head > 0:
                # arc.head is 1-based; 0 denotes the root.
                head_token = Token(words[arc.head - 1], tags[arc.head - 1],
                                   arc.head - 1)
            else:
                head_token = Token(self._ROOT_WORD, self._ROOT_WORD,
                                   arc.head - 1)
            result.append(Relation(arc.relation, head_token,
                                   Token(word, tag, index)))
        return result

    def parse2sents(self, txt):
        """Split ``txt`` into sentences and parse each of them.

        :return: list of ``Sentence`` objects with ``tokens`` (sorted by id)
            and ``relations`` filled in.
        """
        sents = []
        for sent_txt in self.ssplit(txt):
            sent_relations = self.parse2relations(sent_txt + '。')
            tokens = set()
            for relation in sent_relations:
                # Skip the synthetic root head.  The check used to compare
                # against 'ROOT', which never matched the 'Root' placeholder.
                if relation.token1.word != self._ROOT_WORD:
                    tokens.add(relation.token1)
                tokens.add(relation.token2)
            # sent = Sentence(''.join([w.word for w in tokens]))
            sent = Sentence(sent_txt)
            sent.tokens = sorted(tokens, key=lambda t: t.id)
            sent.relations = sent_relations
            sents.append(sent)
        return sents
class LtpParser:
    """Bundle of pyltp models with helpers that reshape parser output."""

    def __init__(self):
        """Load segmentation, POS, parsing, NER and SRL models."""
        LTP_DIR = "../../res/ltp/ltp_data_v3.4.0"
        LTP_DIR_USER = "******"
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(os.path.join(LTP_DIR, "cws.model"),
                                         os.path.join(LTP_DIR_USER, "fulluserdict.txt"))
        # self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load_with_lexicon(os.path.join(LTP_DIR, "pos.model"),
                                         os.path.join(LTP_DIR_USER, "fulluserdict.txt"))
        # self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))

    def format_labelrole(self, words, postags, arcs=None):
        """Run semantic role labelling and index the result by predicate.

        :param arcs: dependency arcs for ``words``; parsed here when omitted.
            Passing them avoids parsing the same sentence twice.
        :return: ``{role.index: {arg.name: [arg.name, start, end]}}``.
        """
        if arcs is None:
            arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        return {
            role.index: {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
            for role in roles
        }

    @staticmethod
    def _children_by_relation(word_count, arcs):
        """Return, per word, a dict mapping relation -> child arc indices."""
        child_dict_list = [dict() for _ in range(word_count)]
        for arc_index, arc in enumerate(arcs):
            # arc.head is 1-based; 0 (root) has no entry in the list.
            if 1 <= arc.head <= word_count:
                child_dict_list[arc.head - 1].setdefault(
                    arc.relation, []).append(arc_index)
        return child_dict_list

    def build_parse_child_dict_two(self, words, arcs):
        """Build, for every word, a dict of its dependency children.

        Args:
            words: list of tokens.
            arcs: dependency arcs, one per token.

        Returns:
            A list aligned with ``words``; each entry maps a relation name to
            the indices of the tokens attached to that word by it.
        """
        return self._children_by_relation(len(words), arcs)

    def build_parse_child_dict(self, words, postags, arcs):
        """Build the per-word child dicts plus a flat description of each arc.

        :return: ``(child_dict_list, format_parse_list)``.  Every row of
            ``format_parse_list`` is ``[relation, word, index, postag,
            head_word, head_index, head_postag]``, e.g.
            ``['ATT', '李克强', 0, 'nh', '总理', 1, 'n']``.  For the root the
            head word is ``'Root'`` and the head index ``-1``; the head
            postag is then ``postags[-1]`` (the last token's tag), a quirk
            kept for compatibility.
        """
        child_dict_list = self._children_by_relation(len(words), arcs)
        rely_id = [arc.head for arc in arcs]  # head ids (1-based, 0 = root)
        relation = [arc.relation for arc in arcs]
        heads = ['Root' if head == 0 else words[head - 1] for head in rely_id]
        format_parse_list = [
            [relation[i], word, i, postags[i], heads[i], rely_id[i] - 1,
             postags[rely_id[i] - 1]]
            for i, word in enumerate(words)
        ]
        return child_dict_list, format_parse_list

    def parser_main(self, sentence):
        """Segment, tag, parse and role-label ``sentence``.

        :return: ``(words, postags, child_dict_list, roles_dict,
            format_parse_list, parse_child_dict)``.
        """
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        parse_child_dict = self.build_parse_child_dict_two(words, arcs)
        roles_dict = self.format_labelrole(words, postags, arcs)
        return words, postags, child_dict_list, roles_dict, format_parse_list, parse_child_dict

    def parser_main_two(self, sentence):
        """Like :meth:`parser_main`, additionally returning NER tags and arcs.

        :return: ``(words, postags, netags, arcs, child_dict_list,
            format_parse_list, roles_dict)``.
        """
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        # Named entity recognition (person, place and organization names).
        netags = self.recognizer.recognize(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags, arcs)
        return words, postags, netags, arcs, child_dict_list, format_parse_list, roles_dict
class PyLTPEntityExtractor(EntityExtractor):
    """Entity extractor that turns selected pyltp POS tags into entities."""

    name = 'PyLTP_entity_extractor'

    provides = ["entities"]

    requires = ['tokens']

    defaults = {
        "model_path": None,  # Nh: name Ni: organization Ns: place
        "part_of_speech": ['nh'],
        "rename_to_entity": ['username'],  # rename 'nh' to 'username'
        "dictionary_path": None  # customize dictionary
    }

    def __init__(self, component_config=None):
        # type: (Optional[Dict[Text, Text]]) -> None
        """Load the segmentation and POS models from ``model_path``.

        When ``dictionary_path`` is configured, both models are loaded
        together with that lexicon.
        """
        super(PyLTPEntityExtractor, self).__init__(component_config)
        self.model_path = self.component_config.get('model_path')
        self.dictionary_path = self.component_config.get('dictionary_path')

        self.segmentor = Segmentor()
        self.postagger = Postagger()

        cws_model = self.model_path + "/cws.model"
        pos_model = self.model_path + "/pos.model"
        if self.dictionary_path is None:
            self.segmentor.load(cws_model)
            self.postagger.load(pos_model)
        else:
            self.segmentor.load_with_lexicon(cws_model, self.dictionary_path)
            self.postagger.load_with_lexicon(pos_model, self.dictionary_path)

    @classmethod
    def create(cls, cfg):
        """Build the component from the pipeline configuration."""
        component_conf = cfg.for_component(cls.name, cls.defaults)
        return PyLTPEntityExtractor(component_conf)

    @classmethod
    def required_packages(cls):
        # type: () -> List[Text]
        return ["pyltp"]

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        """Extract entities from ``message`` and store them on it."""
        extracted = self.add_extractor_name(self.extract_entities(message))
        message.set("entities", extracted, add_to_output=True)

    def extract_entities(self, message):
        # type: (Message) -> List[Dict[Text, Any]]
        """Append an entity for every token whose POS tag is configured.

        Tokens whose text already appears as the ``value`` of an existing
        entity are skipped.  The entity name is the entry of
        ``rename_to_entity`` at the position of the tag in
        ``part_of_speech``; the tag itself is used when that entry is empty
        or missing.

        :return: the message's entity list with the new entities appended.
        """
        sentence = message.text
        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        part_of_speech = self.component_config["part_of_speech"]
        rename_to_entity = self.component_config["rename_to_entity"]
        raw_entities = message.get("entities", [])

        # Walk through the sentence token by token so that each token is
        # located at its own position instead of at the first occurrence of
        # its text anywhere in the sentence.
        cursor = 0
        for word, postag in zip(words, postags):
            found = sentence.find(word, cursor)
            if found >= 0:
                cursor = found + len(word)
            if postag not in part_of_speech:
                continue
            # Fall back to a search from the beginning (raises ValueError
            # when the token does not occur in the sentence at all).
            start = found if found >= 0 else sentence.index(word)
            end = start + len(word)
            entity_index = part_of_speech.index(postag)
            configured_name = (rename_to_entity[entity_index]
                               if entity_index < len(rename_to_entity) else None)
            rename_entity = configured_name or postag
            already_extracted = any(
                obj and obj['value'] == word for obj in raw_entities)
            if not already_extracted:
                raw_entities.append({
                    'start': start,
                    'end': end,
                    'value': word,
                    'entity': rename_entity
                })
        return raw_entities

    @classmethod
    def load(
            cls,
            model_dir=None,  # type: Optional[Text]
            model_metadata=None,  # type: Optional[Metadata]
            cached_component=None,  # type: Optional[Component]
            **kwargs  # type: **Any
    ):
        """Re-create the component from persisted model metadata."""
        meta = model_metadata.for_component(cls.name)
        return cls(meta)
class NLPExecutor:
    """Facade over pyltp, TextRank and gensim used by the pipeline."""

    def __init__(self):
        """Load the segmentation, POS and dependency models."""
        self.seg = Segmentor()
        self.seg.load(cwsPath)
        self.pos = Postagger()
        self.pos.load(posPath)
        self.parser = Parser()
        self.parser.load(parserPath)
        self.tr = TextRank4Sentence()

    def generateSummary(self, text):
        """Return the key sentences of ``text`` (currently the top one).

        :param text: input text.
        :return: list of summary sentences.
        """
        # TODO: the summarization approach still needs improvement.
        self.tr.analyze(text=text)
        return self.tr.get_key_sentences(num=1)

    def splitSentences(self, text):
        """Split ``text`` into a list of sentences."""
        return list(SentenceSplitter.split(text))

    def similarity(self, sent1, sent2):
        """Return the similarity of two sentences.

        :return: ``0`` when either sentence is empty, otherwise the
            similarity of ``sent1`` against ``sent2`` computed on a
            bag-of-words index of the two sentences.
        """
        if sent1 == '' or sent2 == '':
            return 0
        text1 = self.wordTokenize(sent1)
        text2 = self.wordTokenize(sent2)
        texts = [text1, text2]
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        index = Similarity('-Similarity-index', corpus,
                           num_features=len(dictionary))
        # Position 1 of the query result is the score against sent2.
        return index[dictionary.doc2bow(text1)][1]

    # TODO: adding "VALIDATES THAT" belongs in the RUCM generation layer.
    # def addValidate(self,sentence):
    #     tokens=self.wordTokenize(sentence)
    #     tokens[1]='VALIDATES THAT'
    #     return ''.join(tokens)

    def wordTokenize(self, sentence):
        """Segment ``sentence``; punctuation marks are returned as tokens."""
        return list(self.seg.segment(sentence))

    def posTag(self, sentence=None, wordlist=None):
        """Return the POS tags for a sentence or an already segmented list.

        :param sentence: raw sentence; when given it is segmented first and
            ``wordlist`` is ignored.
        :param wordlist: list of tokens.
        :return: list of POS tags aligned with the tokens.
        """
        if sentence is not None:
            wordlist = list(self.seg.segment(sentence))
        return list(self.pos.postag(wordlist))

    def dictUpdate(self, segDict=None, posDict=None):
        """Reload the models together with external lexicons.

        :param segDict: path of the segmentation lexicon, a plain text file
            with one word per line.
        :param posDict: path of the tagging lexicon; each line holds a word
            followed by one or more space separated POS tags.
        """
        if segDict is not None:
            self.seg.load_with_lexicon(cwsPath, segDict)
        if posDict is not None:
            self.pos.load_with_lexicon(posPath, posDict)

    def parse(self, wordlist=None, text=None):
        """Dependency-parse a token list or a raw text.

        :param wordlist: list of tokens.
        :param text: raw sentence; when given it is segmented first and
            ``wordlist`` is ignored.
        :return: list of dependency arcs.
        """
        if text is not None:
            wordlist = self.wordTokenize(text)
        poslist = self.posTag(wordlist=wordlist)
        return list(self.parser.parse(wordlist, poslist))

    def normalize(self, sentence, parselist=None):
        """Append the normalized form of ``sentence`` to its ``normalContent``.

        Control keywords are replaced by their upper-case English markers
        (which may also change ``sentence.type``), and for 'then' / 'normal'
        sentences numerals and measure words acting as attributes are removed.

        :param sentence: Sentence object providing ``wordlist``, ``type`` and
            ``normalContent``.
        :param parselist: dependency arcs of the sentence; parsed when omitted.
        """
        # TODO: keep tuning the result while debugging.
        wordlist = sentence.wordlist
        poslist = self.posTag(wordlist=wordlist)
        if parselist is None:
            parselist = self.parse(wordlist=wordlist)
        newWords = list(wordlist)
        # TODO: replace IF, ELSE, THEN, DO, UNTIL
        if sentence.type != 'then':
            for i, word in enumerate(wordlist):
                if word == '如果':
                    newWords[i] = 'IF'
                    sentence.type = 'conditional'
                elif word == '那么':
                    newWords[i] = 'THEN'
                elif word == '否则':
                    newWords[i] = 'ELSE'
                elif word == '直到':
                    newWords[i] = 'UNTIL'
                    if sentence.type != 'conditional':
                        sentence.type = 'circular'
                elif word == '同时':
                    newWords[i] = 'MEANWHILE'
        # TODO: review the effect of removing quantifiers.
        if sentence.type == 'then' or sentence.type == 'normal':
            # Walk backwards so deletions do not shift pending indices.
            for i in range(len(parselist) - 1, -1, -1):
                if parselist[i].relation == 'ATT' and poslist[i] in ('m', 'q'):
                    del newWords[i]
        if sentence.normalContent is None:
            sentence.normalContent = ''
        sentence.normalContent += ''.join(newWords)

    def isSimple(self, parselist):
        """Return True when the parse contains exactly one SBV relation."""
        subject_count = sum(1 for arc in parselist if arc.relation == 'SBV')
        return subject_count == 1

    def maxSimilarity(self, sentlist, sent):
        """Find the sentence of ``sentlist`` most similar to ``sent``.

        :return: ``[index, similarity]`` of the best match, or ``[-1, -1]``
            when no sentence scores above -1.
        """
        best = [-1, -1]
        for i, candidate in enumerate(sentlist):
            score = self.similarity(candidate.originContent, sent.originContent)
            if score > best[1]:
                best = [i, score]
        return best
class SentenceParser(object):
    """Segment, tag and dependency-parse text, and match rule patterns."""

    def __init__(self):
        """Load the LTP models (with the project lexicon) and the rule file."""
        self.sen_split = SentenceSplitter()
        self.seg = Segmentor()
        self.seg.load_with_lexicon(CWS_MODEL, "resource/lexicon")
        self.pos = Postagger()
        self.pos.load_with_lexicon(POS_MODEL, "resource/lexicon")
        self.parser = Parser()
        self.parser.load(PARSER_MODEL)
        self.rule = IterDocument("resource/rule")

    def seg_sentence(self, text):
        """Split ``text`` into sentences."""
        return self.sen_split.split(text)

    def seg_token(self, text):
        """
        :param text: the raw string
        :return: a list of token
        """
        return self.seg.segment(text)

    def pos_tag(self, words):
        """
        :param words: the list of token
        :return: a list of pos
        """
        return self.pos.postag(words)

    def parse(self, words, pos):
        """Dependency-parse the sentence and return the root ``WordNode``.

        Children are attached breadth-first through each node's ``next``
        list.  An empty ``WordNode`` is returned for empty input.

        :param words: list of tokens.
        :param pos: list of POS tags aligned with ``words``.
        """
        if len(words) == 0 or len(pos) == 0:
            return WordNode("", "", "", None)
        arcs = self.parser.parse(words, pos)
        # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
        nodes = [(arc.head, arc.relation) for arc in arcs]
        root_idx = find_x(nodes, 0)
        root_pos = root_idx[0]
        root = WordNode(words[root_pos], pos[root_pos], nodes[root_pos][1])
        tree = {root_pos: root}
        queue = root_idx
        while len(queue):
            next_idx = queue.pop()
            # Heads are 1-based, hence the +1 when looking up children.
            for idx in find_x(nodes, next_idx + 1):
                queue.insert(0, idx)
                new_node = WordNode(words[idx], pos[idx], nodes[idx][1])
                tree[next_idx].next.append(new_node)
                tree[idx] = new_node
        return root

    def extract(self, path):
        """Return the token strings of path segments that match a rule.

        A rule is a ``;`` separated sequence of ``relation,pos`` items.  For
        every path and every rule, the first window of the path whose
        ``relation,pos`` sequence equals the rule contributes the
        concatenation of its tokens.

        :param path: list of paths; each path is a list of nodes exposing
            ``relation``, ``pos`` and ``token``.
        :return: list of matched token strings.
        """
        res = []
        if len(path) == 0:
            return res
        rule = self.rule
        for p in path:
            labels = ["%s,%s" % (x.relation, x.pos) for x in p]
            for r in rule:
                window_size = len(r.split(";"))
                # +1 so that the window ending at the last node is checked as
                # well (it used to be skipped); this also covers the case of
                # a path exactly as long as the rule.
                for i in range(len(p) - window_size + 1):
                    if ";".join(labels[i:i + window_size]) == r:
                        res.append("".join(
                            x.token for x in p[i:i + window_size]))
                        break
        return res
]) X = pd.read_csv('./x.csv') corpus = X['ner'].map(f).tolist() # print(corpus) tfidf = TfidfVectorizer() tfidf.fit(corpus) tfidf_train = tfidf.transform(corpus) tfidf_feature = pd.DataFrame(tfidf_train.toarray()) postagger = Postagger() # 初始化实例 # postagger.load_with_lexicon('F:\ltp_data\pos.model', '../data/user_dict.txt') # 加载模型 postagger.load_with_lexicon('E:\ltp_data\pos.model', '../data/user_dict.txt') # 加载模型 def parse(s): """ 对语句进行句法分析,并返回句法结果 parse_result:依存句法解析结果 source:企业实体的词序号 target:另一个企业实体的词序号 keyword_pos:关键词词序号列表 source_dep:企业实体依存句法类型 target_dep:另一个企业实体依存句法类型 """ tmp_ner_dict = {} num_lst = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']
class LtpParser:
    """Bundle of pyltp models, with two lexicon variants for seg/POS."""

    def __init__(self):
        """Load all models.

        Two segmentors and two POS taggers are loaded from the same model
        files, each pair with a different external lexicon (the plain one and
        the ``*_label`` one).
        """
        # Raw strings: the paths contain backslashes that are not meant to be
        # escape sequences.
        LTP_DIR = r"D:\python\ltp_data_v3.4.0"
        Segmentor_lexicon = r'D:\python\ltp_data_v3.4.0\lexicon'
        Segmentor_label_lexicon = r'D:\python\ltp_data_v3.4.0\lexicon_label'
        Postagger_lexicon = r'D:\python\ltp_data_v3.4.0\lexicon_1'
        Postagger_label_lexicon = r'D:\python\ltp_data_v3.4.0\lexicon_label_1'

        self.segmentor = Segmentor()
        self.segmentor_label = Segmentor()
        cws_model_path = os.path.join(LTP_DIR, "cws.model")
        # The second argument is the path of the external lexicon file.
        self.segmentor.load_with_lexicon(cws_model_path, Segmentor_lexicon)
        self.segmentor_label.load_with_lexicon(cws_model_path,
                                               Segmentor_label_lexicon)

        self.postagger = Postagger()
        self.postagger_label = Postagger()
        pos_model_path = os.path.join(LTP_DIR, "pos.model")
        self.postagger.load_with_lexicon(pos_model_path, Postagger_lexicon)
        self.postagger_label.load_with_lexicon(pos_model_path,
                                               Postagger_label_lexicon)

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))

    def format_labelrole(self, words, postags, arcs=None):
        """Run semantic role labelling and index the result by predicate.

        :param arcs: dependency arcs for ``words``; parsed here when omitted.
            Passing them avoids parsing the same sentence twice.
        :return: ``{role.index: {arg.name: [arg.name, start, end]}}``.
        """
        if arcs is None:
            arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict

    def build_parse_child_dict(self, words, postags, arcs):
        """Build per-word child dicts plus a flat description of each arc.

        :return: ``(child_dict_list, format_parse_list)``.
            ``child_dict_list[i]`` maps a relation name to the indices of the
            tokens attached to word ``i`` by it.  Every row of
            ``format_parse_list`` is ``[relation, word, index, postag,
            head_word, head_index, head_postag]``, e.g.
            ``['ATT', '李克强', 0, 'nh', '总理', 1, 'n']``.  For the root the
            head word is ``'Root'`` and the head index ``-1``; the head
            postag is then ``postags[-1]`` (the last token's tag), a quirk
            kept for compatibility.
        """
        child_dict_list = [dict() for _ in words]
        for arc_index, arc in enumerate(arcs):
            # arc.head is 1-based; 0 (root) has no entry in the list.
            if 1 <= arc.head <= len(words):
                child_dict_list[arc.head - 1].setdefault(
                    arc.relation, []).append(arc_index)
        rely_id = [arc.head for arc in arcs]  # head ids (1-based, 0 = root)
        relation = [arc.relation for arc in arcs]
        heads = ['Root' if head == 0 else words[head - 1] for head in rely_id]
        format_parse_list = [
            [
                relation[i], word, i, postags[i], heads[i], rely_id[i] - 1,
                postags[rely_id[i] - 1]
            ]
            for i, word in enumerate(words)
        ]
        return child_dict_list, format_parse_list

    def parser_main(self, sentence):
        """Segment, tag, parse and role-label ``sentence``.

        :return: ``(words, postags, child_dict_list, roles_dict,
            format_parse_list)``.
        """
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags, arcs)
        return words, postags, child_dict_list, roles_dict, format_parse_list

    def tag_entity_annotation(self, entity):
        """Segment, tag and NER-label ``entity`` with the plain lexicons.

        :return: ``(words, postags, netags)``.
        """
        words = self.segmentor.segment(entity)
        postags = self.postagger.postag(words)
        netags = self.recognizer.recognize(words, postags)
        # print('\t'.join(netags))
        return words, postags, netags

    def tag_entity_annotation_v2(self, entity):
        """Same as :meth:`tag_entity_annotation`, using the label lexicons.

        :return: ``(words, postags, netags)``.
        """
        words = self.segmentor_label.segment(entity)
        postags = self.postagger_label.postag(words)
        netags = self.recognizer.recognize(words, postags)
        # print('\t'.join(netags))
        return words, postags, netags