from ltp import LTP


class NamedEntity:
    def __init__(self, user_dict):
        self.ltp = LTP()  # loads the Small model by default
        # user_dict is the lexicon file; max_window is the maximum forward segmentation window
        self.ltp.init_dict(path=user_dict, max_window=4)

    def entity_recognition(self, text: list):
        """
        Named entity recognition.
        :param text: raw text (a list of sentences)
        :return: named entities extracted from the raw text
        """
        seg, hidden = self.ltp.seg(text)  # word segmentation
        ner = self.ltp.ner(hidden)
        entity = []
        for tag, start, end in ner[0]:
            # join the whole span so multi-token entities are not truncated to their first token
            entity.append(''.join(seg[0][start:end + 1]))
        return entity
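# A minimal usage sketch for NamedEntity, assuming LTP 4.x is installed and a lexicon
# file exists at the given path; "user_dict.txt" and the sample sentence below are
# placeholders, not part of the original module.
if __name__ == "__main__":
    recognizer = NamedEntity("user_dict.txt")
    # input is a list of sentences; entities are extracted from the first one
    print(recognizer.entity_recognition(["他叫汤姆去拿外衣。"]))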
import os

from ltp import LTP

from ..config.config import LEX_PATH

ltp = LTP(path="base")
ltp.init_dict(path=os.path.join(LEX_PATH, 'aerospace_lexicon.txt'))


def cut(sent):
    segment, _ = ltp.seg([sent])
    return segment[0]


def pos_cut(sent):
    segment, hidden = ltp.seg([sent])
    pos = ltp.pos(hidden)
    return [tuple(segment[0]), tuple(pos[0])]
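# Usage sketch for the two helpers above (written as a comment because the relative
# import of LEX_PATH requires package context); the module name and the sample
# sentence are illustrative only:
#
#     from .segmenter import cut, pos_cut   # hypothetical module name
#     print(cut("运载火箭成功发射。"))        # token list of the sentence
#     print(pos_cut("运载火箭成功发射。"))    # [tuple of tokens, tuple of POS tags]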
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 10:08:36 2020

@author: DELL
"""
import os

from ltp import LTP

from ..config.config import LEX_PATH

ltp = LTP(path="base")
ltp.init_dict(path=os.path.join(LEX_PATH, 'air_lexicon.txt'))


def cut(sent):
    segment, _ = ltp.seg([sent])
    return segment[0]


def pos_cut(sent):
    segment, hidden = ltp.seg([sent])
    pos = ltp.pos(hidden)
    return [tuple(segment[0]), tuple(pos[0])]
class NLP:
    """Natural language processing: word segmentation, POS tagging, named entity
    recognition, and dependency parsing.

    Attributes:
        default_user_dict_dir: str, directory of user-defined lexicons
    """
    RESOURCE_DIR = os.path.abspath(
        os.path.join(
            os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
            "resource"))

    def __init__(self, model_type='base', user_dict_dir=RESOURCE_DIR):
        self.default_user_dict_dir = user_dict_dir
        # load the LTP model
        self.ltp = LTP(model_type)
        # add the user lexicons (the legal-document dictionary and the Tsinghua legal
        # lexicon); loading them into memory this way is faster
        files = os.listdir(user_dict_dir)
        for file in files:
            file_path = os.path.join(user_dict_dir, file)
            # skip sub-directories
            if os.path.isdir(file_path):
                continue
            self.ltp.init_dict(file_path)
        # # POS tagging model
        # self.postagger = Postagger()
        # postag_flag = self.postagger.load(os.path.join(self.default_model_dir, 'pos.model'))
        # # named entity recognition model
        # self.recognizer = NamedEntityRecognizer()
        # ner_flag = self.recognizer.load(os.path.join(self.default_model_dir, 'ner.model'))
        # # dependency parsing model
        # self.parser = Parser()
        # parse_flag = self.parser.load(os.path.join(self.default_model_dir, 'parser.model'))

    def segment(self, sentence, entity_postag=dict()):
        """Segment a sentence with LTP.

        Args:
            sentence: string, the sentence
            entity_postag: dict, entity-POS lexicon; empty by default, built while
                analysing the structured text of each case
        Returns:
            lemmas: list, segmentation result
        """
        # add the entity lexicon to the segmenter
        if entity_postag:
            for entity in entity_postag:
                self.ltp.add_words([entity])
        segment, hidden = self.ltp.seg([sentence])
        return segment[0], hidden

    def postag(self, segment, hidden):
        """POS-tag a segmented sentence.

        Args:
            segment: list, segmentation result
        Returns:
            words: WordUnit list, with segmentation and POS-tagging results
        """
        words = []  # word units of the processed sentence
        # POS tagging
        postags = self.ltp.pos(hidden)
        for i in range(len(segment)):
            # build a WordUnit holding the token and its POS tag; IDs start at 1
            word = WordUnit(i + 1, segment[i], postags[0][i])
            words.append(word)
        return words

    def get_postag(self, word):
        """Get the POS tag of a single word.

        Args:
            word: str, the word
        Returns:
            post_tag: str, the POS tag of the word
        """
        # pre-segmented input must be a list of token lists
        _, hidden = self.ltp.seg([[word]], is_preseged=True)
        post_tag = self.ltp.pos(hidden)
        # return the single tag of the single word
        return post_tag[0][0]

    def netag(self, words, hidden):
        """Named entity recognition; merge the NER results into the segmented and
        POS-tagged words.

        Args:
            words: WordUnit list, with segmentation and POS-tagging results
        Returns:
            words_netag: WordUnit list, with segmentation, POS-tagging and NER results
        """
        lemmas = []   # segmentation results
        postags = []  # POS-tagging results
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # named entity recognition
        netags = self.ltp.ner(hidden, as_entities=False)
        words_netag = EntityCombine.combine(words, netags[0])
        return words_netag

    def parse_seged(self, words):
        lemmas = []   # segmentation results
        postags = []  # POS-tagging results
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # dependency parsing on the pre-segmented words
        _, hidden = self.ltp.seg([lemmas], is_preseged=True)
        arcs = self.ltp.dep(hidden)[0]
        for i in range(len(arcs)):
            words[i].head = arcs[i][1]
            words[i].dependency = arcs[i][2]
        return SentenceUnit(words)

    def parse(self, words, hidden):
        """Dependency parsing on the segmented, POS-tagged and (optionally)
        NER-processed words.

        Args:
            words: WordUnit list, with segmentation, POS-tagging and NER results
        Returns:
            *: SentenceUnit, the sentence unit
        """
        lemmas = []   # segmentation results
        postags = []  # POS-tagging results
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # dependency parsing
        arcs = self.ltp.dep(hidden)[0]
        for i in range(len(arcs)):
            words[i].head = arcs[i][1]
            words[i].dependency = arcs[i][2]
        return SentenceUnit(words)

    def close(self):
        """Shut down and release the NLP resources."""
        pass
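# A minimal end-to-end sketch of the pipeline above, assuming LTP 4.x and that WordUnit,
# EntityCombine and SentenceUnit are importable from this project, and that RESOURCE_DIR
# contains valid lexicon files; the sentence is a placeholder.
if __name__ == "__main__":
    nlp = NLP()
    sentence = "高克访问中国,并在同济大学发表演讲。"
    segment, hidden = nlp.segment(sentence)        # tokens plus hidden states for reuse
    words = nlp.postag(segment, hidden)            # WordUnit list with POS tags
    words_netag = nlp.netag(words, hidden)         # merge recognised entities into the words
    sentence_unit = nlp.parse_seged(words_netag)   # re-parse the merged words for dependencies
    print(segment)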
from ltp import LTP

# text = '我现在在天津,我想知道这里的大学都有什么学校.'

# Load a user lexicon.
ltp = LTP()
# user_dict.txt is the lexicon file; max_window is the maximum forward segmentation window.
# Note: max_window must be at least as large as the longest entry in the lexicon.
ltp.init_dict(path="user_dict.txt", max_window=6)
# Custom words can also be added directly in code.
ltp.add_words(words=["肖申克的救赎", "长江大桥"], max_window=6)


def searchKG(kglist, text):
    # uses BERT to compute distances
    tmp3 = []
import pandas as pd
from ltp import LTP

df = pd.read_csv('分词以后的药食同源.csv', encoding='utf-8')

ltp = LTP()
# "中药大词典合集.txt" is the lexicon file; max_window is the maximum forward segmentation window
ltp.init_dict(path="中药大词典合集.txt", max_window=4)
segment, _ = ltp.seg(["他叫汤姆去拿外衣。"])
print(segment)
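# Follow-up sketch: the lexicon loaded above should keep multi-character herb names as
# single tokens; the sentence is illustrative and the exact output depends on the model
# and on the contents of 中药大词典合集.txt.
segment_herbs, _ = ltp.seg(["金银花和甘草都是药食同源的中药。"])
print(segment_herbs[0])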
class NLP:
    """A wrapper around the results of LTP analysis."""

    def __init__(self, default_model_dir=LTP4_MODEL_DIR, user_dict_dir=USER_DICT_DIR):
        self.ltp = LTP(path=default_model_dir)
        for file in os.listdir(user_dict_dir):
            self.ltp.init_dict(path=os.path.join(user_dict_dir, file))
        self.sentences = []
        self.postags = []
        self.nertags = []
        self.dep = []

    def segment(self, sentences):
        self.sentences = sentences
        lemmas, hidden = self.ltp.seg(sentences)
        return lemmas, hidden

    def postag(self, lemmas, hidden):
        """
        Build WordUnit lists from the POS-tagging results.
        :param lemmas:
        :param hidden:
        :return:
        """
        words = []
        postags = self.ltp.pos(hidden)
        self.postags = postags
        for idx_sent, postags_sent in enumerate(postags):
            words_sent = []
            for i in range(len(postags_sent)):
                # word IDs start at 1
                word = WordUnit(i + 1, lemmas[idx_sent][i], postags_sent[i])
                words_sent.append(word)
            words.append(words_sent)
        return words

    def nertag(self, words, hidden):
        """
        Update words with the NER results; the NER tags correct and enrich the POS
        tags, e.g. n -> ni/ns/nl.
        :param words:
        :param hidden:
        :return:
        """
        # Nh: person name, Ni: organisation name, Ns: place name
        nertags = self.ltp.ner(hidden)
        self.nertags = nertags
        '''
        Triple extraction relies on NER information, so words covered by one entity
        may need to be merged into a new word.
        NOTE: after NER, several tokens may be combined into a single word, e.g.:
        [['高克', '访问', '中国', ',', '并', '在', '同济', '大学', '发表', '演讲', '。']]
        [['nh', 'v', 'ns', 'wp', 'c', 'p', 'nz', 'n', 'v', 'v', 'wp']]
        [[('Nh', 0, 0), ('Ns', 2, 2), ('Ni', 6, 7)]]
        [[(1, 2, 'SBV'), (2, 0, 'HED'), (3, 2, 'VOB'), (4, 2, 'WP'), (5, 9, 'ADV'),
          (6, 9, 'ADV'), (7, 8, 'ATT'), (8, 6, 'POB'), (9, 2, 'COO'), (10, 9, 'VOB'),
          (11, 2, 'WP')]]
        '''
        ner2pos = {'Nh': 'nh', 'Ns': 'ns', 'Ni': 'ni'}
        for idx_sent, nertags_sent in enumerate(nertags):
            for item in nertags_sent:
                for i in range(item[1], item[2] + 1):
                    words[idx_sent][i].nertag = item[0]
                    words[idx_sent][i].postag = ner2pos[item[0]]
        return words

    def dependency(self, words, hidden):
        """
        Attach the dependency-parse results to the words for later triple extraction
        (mainly the dependency relations between words).
        :param hidden:
        :return:
        """
        sentences = []
        dep = self.ltp.dep(hidden)
        for idx_sent, dep_sent in enumerate(dep):
            for i in range(len(words[idx_sent])):
                # e.g. [(1, 2, 'ATT'), (2, 3, 'ATT')] with (3, 0, 'HED') omitted
                if i < len(dep_sent):
                    words[idx_sent][i].head = dep_sent[i][1]  # the head word's ID, not its index
                    words[idx_sent][i].dependency = dep_sent[i][2]
                    # also record each word's children in the dependency tree;
                    # dep_sent[i][1] is the head word's ID, and child_words stores
                    # (child ID, dependency label)
                    words[idx_sent][dep_sent[i][1] - 1].child_words.append(
                        (dep_sent[i][0], dep_sent[i][2]))
            sentences.append(
                SentenceUnit(self.sentences[idx_sent], self.nertags[idx_sent],
                             words[idx_sent]))
        return sentences
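# A chained-usage sketch for this wrapper, assuming LTP4_MODEL_DIR, USER_DICT_DIR,
# WordUnit and SentenceUnit are provided elsewhere in the project; the sentence list
# is a placeholder.
if __name__ == "__main__":
    nlp = NLP()
    lemmas, hidden = nlp.segment(["高克访问中国,并在同济大学发表演讲。"])
    words = nlp.postag(lemmas, hidden)               # WordUnit lists, one per sentence
    words = nlp.nertag(words, hidden)                # overwrite POS tags for recognised entities
    sentence_units = nlp.dependency(words, hidden)   # SentenceUnit list with dependency info
    print(lemmas[0])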