def seg_with_ltp40(in_file, out_file_path, manual_seg_file):
    # initialize the model
    ltp = LTP()
    line_list = []  # holds the sentence currently being segmented
    corpus = construct_corpus(in_file)
    f = open(out_file_path, "w", encoding='utf-8')
    for line in corpus:
        line_list.append(line)  # wrap each sentence in a list: ["Xxxx"]
        seg_result, hidden = ltp.seg(line_list)
        f.write("=".join(seg_result[0]) + "\n")
        line_list.clear()
    f.flush()
    f.close()

    # test qps
    corpus = construct_corpus(in_file, 1)
    start = time.time()
    for line in corpus:
        # [line] keeps the sentence whole; the original list(line) would have
        # split the string into single characters
        segment, hidden = ltp.seg([line])
    end = time.time()
    qps = round(len(corpus) / (end - start), 2)

    # test accuracy
    p, r, f1, line_aver_length = evaluate(out_file_path, manual_seg_file)
    return qps, p, r, f1, line_aver_length
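# `evaluate` is not defined in the snippet above. Below is a minimal sketch of
# the standard span-based segmentation P/R/F1, assuming both files use the
# '='-joined format written above (one sentence per line); `line_aver_length`
# is taken to be the average gold sentence length in characters.
def evaluate(pred_file, gold_file):
    def spans(words):
        # convert a word list into a set of (start, end) character spans
        out, offset = set(), 0
        for w in words:
            out.add((offset, offset + len(w)))
            offset += len(w)
        return out

    tp = pred_total = gold_total = char_total = line_total = 0
    with open(pred_file, encoding='utf-8') as fp, open(gold_file, encoding='utf-8') as fg:
        for pred_line, gold_line in zip(fp, fg):
            pred = spans(pred_line.strip().split('='))
            gold = spans(gold_line.strip().split('='))
            tp += len(pred & gold)
            pred_total += len(pred)
            gold_total += len(gold)
            char_total += len(gold_line.strip().replace('=', ''))
            line_total += 1
    p = tp / pred_total if pred_total else 0.0
    r = tp / gold_total if gold_total else 0.0
    f1 = 2 * p * r / (p + r) if p + r else 0.0
    return p, r, f1, char_total / max(line_total, 1)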
class Ner:
    def __init__(self):
        self.ltp = LTP()

    def preprocess(self, sent):
        # strip all whitespace from the sentence
        return re.sub(r'\s+', '', sent)

    def ner(self, sents):
        assert not any(re.search(r'\s', x) for x in sents), "no space is allowed"
        psents = [x for x in sents if x != '']
        if len(psents) == 0:
            return [[] for x in sents]
        segment, hidden = self.ltp.seg(psents)
        ne = self.ltp.ner(hidden)
        anes = []
        for sseg, sne in zip(segment, ne):
            nes = []
            # prefix sums of word lengths -> character offsets
            slens = [0] + [len(x) for x in sseg]
            for i in range(1, len(slens)):
                slens[i] += slens[i - 1]
            for t, x, y in sne:
                if t == 'Ns':  # keep only place names (Ns)
                    nes.append([slens[x], slens[y + 1]])
            anes.append(nes)
        # re-insert empty results for the sentences filtered out above
        fnes = []
        cur = 0
        for s in sents:
            if s == '':
                fnes.append([])
            else:
                fnes.append(anes[cur])
                cur += 1
        return fnes
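# Hedged usage sketch for the Ner class above (assumes `re` and `LTP` are
# imported as in the snippet). Each result is a list of character-level
# [start, end) spans of Ns (place name) entities; empty inputs map to [].
tagger = Ner()
print(tagger.ner(["南京是一座城市", ""]))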
def ltp_func(text_list):
    ltp = LTP()
    seg, hidden = ltp.seg(text_list)
    pos = ltp.pos(hidden)
    result = []
    # pair each word of the first sentence with its POS tag, e.g. "汤姆/nh"
    for idx, val in enumerate(seg[0]):
        pag = [val, pos[0][idx]]
        result.append('/'.join(pag))
    return result
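# Hedged usage sketch for ltp_func above; the sentence is illustrative and the
# exact tags depend on the loaded LTP model.
print(ltp_func(["他叫汤姆去拿外衣。"]))  # e.g. ['他/r', '叫/v', '汤姆/nh', ...]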
def dependency(self):
    sentence = self.sentence
    sentences = [sentence]
    ltp = LTP()
    seg, hidden = ltp.seg(sentences)
    dep = ltp.dep(hidden)
    print(seg)
    print(dep)
def work_summary_parser_ltp():
    f = csvReader("标准工作任务单")
    ltp = LTP()
    paList = []
    for i, row in enumerate(f):
        if i != 0:  # skip the header row
            val = row[1][5:].split(',')
            paList.append(val[2])
    wa, ha = ltp.seg(paList)
    pa = ltp.pos(ha)
    return wa, pa
def findFood(self, sentence):
    ltp = LTP()
    words, hidden = ltp.seg([sentence])
    posTags = ltp.pos(hidden)
    words = words[0]          # word segmentation result (list)
    posTags = posTags[0]      # POS tagging result (list)
    dep = ltp.dep(hidden)[0]  # dependency parsing result (list)
    relyId = [d[1] for d in dep]    # parent node ids
    relation = [d[2] for d in dep]  # dependency relations
    heads = ['Root' if id == 0 else words[id - 1] for id in relyId]  # parent node tokens
    string = ''
    for i in range(len(words)):
        # a noun that is the direct object (VOB) of "吃" (eat) counts as food
        if 'n' in posTags[i] and heads[i] == '吃' and relation[i] == 'VOB':
            string += words[i]
            string += ' '
    return string
def findFood(sentence):
    ltp = LTP()
    words, hidden = ltp.seg([sentence])
    posTags = ltp.pos(hidden)
    words = words[0]  # word segmentation result (list)
    print(words)
    posTags = posTags[0]  # POS tagging result (list)
    print(posTags)
    dep = ltp.dep(hidden)[0]  # dependency parsing result (list)
    for t in dep:
        print(t)
    relyId = [d[1] for d in dep]    # parent node ids
    relation = [d[2] for d in dep]  # dependency relations
    heads = ['Root' if id == 0 else words[id - 1] for id in relyId]  # parent node tokens
    for i in range(len(words)):
        if 'n' in posTags[i] and heads[i] == '吃' and relation[i] == 'VOB':
            print("找到了一种食物:" + words[i])  # "found a food item"
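# Hedged usage sketch for the standalone findFood above; whether the VOB arc
# is found depends on the LTP model version, so the sentence is illustrative.
findFood("我想吃苹果")  # expected to print 找到了一种食物:苹果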
def WriteTest(readfile, savefile):
    with open(readfile, "r", encoding="utf-8") as rfp:
        ltp = LTP()
        logger.info("Processing file:%s ." % (readfile))
        with open(savefile, 'w', encoding='utf-8') as wfp:
            for row in tqdm(rfp, desc="file %s process" % (readfile)):
                # strip the trailing newline so sent2 is clean
                sent1, sent2 = row.rstrip('\n').split('\t')
                seg, hid = ltp.seg([sent1, sent2])
                sdp = ltp.sdp(hid, mode='tree')
                pos = ltp.pos(hid)
                tmpitem = {
                    'sentence1': [seg[0], pos[0], sdp[0]],
                    'sentence2': [seg[1], pos[1], sdp[1]]
                }
                jsonline = json.dumps(tmpitem)
                wfp.write(jsonline + "\n")
class NamedEntity:
    def __init__(self, user_dict):
        self.ltp = LTP()  # loads the Small model by default
        # user_dict.txt is the dictionary file; max_window is the maximum
        # forward-matching window for segmentation
        self.ltp.init_dict(path=user_dict, max_window=4)

    def entity_recognition(self, text: list):
        """
        Named entity recognition.
        :param text: raw text
        :return: named entities extracted from the raw text
        """
        seg, hidden = self.ltp.seg(text)  # word segmentation
        ner = self.ltp.ner(hidden)
        entity = []
        for tag, start, end in ner[0]:
            # join all words in the span; the original trailing [0]
            # kept only the first word of multi-word entities
            entity.append("".join(seg[0][start:end + 1]))
        return entity
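# Hedged usage sketch for NamedEntity above; "user_dict.txt" is the dictionary
# path mentioned in the class comments, and the sentence is illustrative.
recognizer = NamedEntity("user_dict.txt")
print(recognizer.entity_recognition(["汤姆就读于复旦大学。"]))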
def is_word(sentence):
    r"""
    Judge whether the input is a single word.

    :param str sentence: input sentence string
    :return bool: is a word or not
    """
    from ltp import LTP
    if sentence[0] == sentence[1]:
        return True
    ltp = LTP()
    seg, hidden = ltp.seg([sentence])
    pos = ltp.pos(hidden)
    pos = pos[0]
    if len(pos) == 1 and pos[0] == 'n':
        return False
    return True
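# Hedged usage sketch for is_word above: by its logic, an input that LTP
# segments into a single noun returns False, everything else True.
print(is_word("苹果"))  # expected False: segmented as one noun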
def work_detail_parser_ltp():
    f = csvReader("标准工作任务单")
    ltp = LTP()
    paList = []
    pbList = []
    for i, row in enumerate(f):
        if i != 0:  # skip the header row
            val = row[1][5:].split(',')
            paList.append(val[2])
            pbList.extend(val[3:])
    # print(paList)
    # print(pbList)
    sa, ha = ltp.seg(paList)
    sb, hb = ltp.seg(pbList)
    pa = ltp.pos(ha)
    pb = ltp.pos(hb)
    return sa, sb, pa, pb
def prepare_ref(lines: List[str], ltp_tokenizer: LTP, bert_tokenizer: BertTokenizer):
    # segment with LTP in batches of 100 lines
    ltp_res = []
    for i in range(0, len(lines), 100):
        res = ltp_tokenizer.seg(lines[i:i + 100])[0]
        res = [get_chinese_word(r) for r in res]
        ltp_res.extend(res)
    assert len(ltp_res) == len(lines)

    # tokenize with BERT in the same batches
    bert_res = []
    for i in range(0, len(lines), 100):
        res = bert_tokenizer(lines[i:i + 100], add_special_tokens=True,
                             truncation=True, max_length=512)
        bert_res.extend(res["input_ids"])
    assert len(bert_res) == len(lines)

    ref_ids = []
    for input_ids, chinese_word in zip(bert_res, ltp_res):
        input_tokens = []
        for id in input_ids:
            token = bert_tokenizer._convert_id_to_token(id)
            input_tokens.append(token)
        input_tokens = add_sub_symbol(input_tokens, chinese_word)
        ref_id = []
        # We only save the positions of Chinese subwords starting with ##,
        # which means they are part of a whole word.
        for i, token in enumerate(input_tokens):
            if token[:2] == "##":
                clean_token = token[2:]
                # save Chinese tokens' positions
                if len(clean_token) == 1 and _is_chinese_char(ord(clean_token)):
                    ref_id.append(i)
        ref_ids.append(ref_id)
    assert len(ref_ids) == len(bert_res)
    return ref_ids
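# `_is_chinese_char` (like `get_chinese_word` and `add_sub_symbol`) is not
# defined in this snippet. Below is a sketch following the CJK code-point
# ranges used by BERT's reference tokenizer.
def _is_chinese_char(cp):
    """Check whether the code point `cp` falls in a CJK ideograph block."""
    if ((0x4E00 <= cp <= 0x9FFF) or (0x3400 <= cp <= 0x4DBF)
            or (0x20000 <= cp <= 0x2A6DF) or (0x2A700 <= cp <= 0x2B73F)
            or (0x2B740 <= cp <= 0x2B81F) or (0x2B820 <= cp <= 0x2CEAF)
            or (0xF900 <= cp <= 0xFAFF) or (0x2F800 <= cp <= 0x2FA1F)):
        return True
    return False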
def new_generate_ltp_results():
    # load the model
    ltp_model = '../../ltp_models/base1'
    ltp = LTP(path=ltp_model)
    # read the original sentences
    data = read_file_in_ltp('../data/train_base.json')
    sentences = list(map(lambda x: x['content'], data))

    segmented, pos, ner, srl, dep, sdp_tree, sdp_graph = [], [], [], [], [], [], []
    for sent in tqdm(sentences):
        # word segmentation
        segmented0, hidden = ltp.seg([sent])
        # POS tagging
        cur_pos = ltp.pos(hidden)
        # named entity recognition
        cur_ner = ltp.ner(hidden)
        # semantic role labeling
        cur_srl = ltp.srl(hidden)
        # dependency parsing
        cur_dep = ltp.dep(hidden)
        # semantic dependency parsing (tree)
        cur_sdp_tree = ltp.sdp(hidden, mode='tree')
        # semantic dependency parsing (graph)
        cur_sdp_graph = ltp.sdp(hidden, mode='graph')
        segmented.append(segmented0[0])
        pos.append(cur_pos[0])
        ner.append(cur_ner[0])
        srl.append(cur_srl[0])
        dep.append(cur_dep[0])
        sdp_tree.append(cur_sdp_tree[0])
        sdp_graph.append(cur_sdp_graph[0])

    # build the sentence-to-segmentation mapping
    sent_seg_matches = sentence_segment_match(data, segmented)
    pickle.dump([segmented, pos, ner, srl, dep, sdp_tree, sdp_graph, sent_seg_matches],
                open('new_ltp_results.pk', 'wb'))
    return segmented, pos, ner, srl, dep, sdp_tree, sdp_graph, sent_seg_matches
def prepare_ref(lines: List[str], ltp_tokenizer: LTP,
                bert_tokenizer: BertTokenizer, batch_size=1000):
    """
    Args:
        lines: one Chinese paragraph per line
        ltp_tokenizer: the LTP tokenizer
        bert_tokenizer: the BERT tokenizer
    Returns:
        ref_ids: per line, the positions of Chinese ## subwords
    """
    ltp_res = []
    # process batch_size lines at a time
    print("Segmenting with the LTP model...")
    for i in tqdm(range(0, len(lines), batch_size)):
        # segment with LTP
        res = ltp_tokenizer.seg(lines[i:i + batch_size])[0]
        # keep only the all-Chinese words of each segmentation
        res = [get_chinese_word(r) for r in res]
        # append to ltp_res
        ltp_res.extend(res)
    assert len(ltp_res) == len(lines)
    # eg: ltp_res looks like [['效果', '一直', '用户', '感觉'], ...]

    # run the BERT tokenizer, batch_size lines at a time
    print("Tokenizing with the BERT tokenizer...")
    bert_res = []
    for i in tqdm(range(0, len(lines), batch_size)):
        res = bert_tokenizer(lines[i:i + batch_size], add_special_tokens=True,
                             truncation=True, max_length=512)
        bert_res.extend(res["input_ids"])
    # eg: bert_res [ [101, 5439, 4500, 2787, 749, 8024, 671, 4684, 1762, 4500, 4007, 2051, 8024, 2697, 6230, 2190, 2971, 4576, 2971, 3779, 3126, 3362, 2923, 1962, 4638, 102]...]

    # make sure the line counts match before building the alignment
    print("Building the alignment...")
    assert len(bert_res) == len(lines)
    print_num = 5
    ref_ids = []
    for input_ids, chinese_word in zip(bert_res, ltp_res):
        input_tokens = []
        for id in input_ids:
            token = bert_tokenizer._convert_id_to_token(id)
            input_tokens.append(token)
        # eg : ['[CLS]', '古', '##龙', '洗', '发', '##水', ',', '洗', '完', '头', '##发', '不', '干', '##燥', '、', '也', '不', '容', '##易', '油', '、', '不', '痒', ',', '味', '##道', '持', '##久', ',', '非', '##常', '柔', '##顺', ',', '而', '##且', '泡', '##泡', '很', '容', '##易', '冲', '##洗', '干', '##净', '泡', '##沫', '非', '##常', '细', '##腻', ',', '洗', '后', '头', '##发', '很', '滑', '很', '顺', ',', '洗', '了', '之', '##后', '就', '头', '##发', '很', '蓬', '##松', ',', '很', '香', ',', '而', '##且', '我', '洗', '了', '是', '没', '##有', '头', '##皮', '##屑', '的', '[SEP]']
        input_tokens = add_sub_symbol(input_tokens, chinese_word)
        ref_id = []
        # we only save the positions of Chinese subwords starting with ##,
        # which means they are part of a whole word
        for i, token in enumerate(input_tokens):
            if token[:2] == "##":
                clean_token = token[2:]
                # keep only the tail halves of Chinese subwords; their
                # positions form this sentence's mapping, saved in ref_id
                if len(clean_token) == 1 and _is_chinese_char(ord(clean_token)):
                    ref_id.append(i)
        # print the first 5 examples
        if print_num > 0:
            example_num = 5 - print_num
            print(f"Sample {example_num}: {lines[example_num]}")
            print(f"LTP segmentation of sample {example_num}: {ltp_res[example_num]}")
            print(f"BERT tokenization of sample {example_num}: {bert_res[example_num]}")
            print(f"BERT tokens of sample {example_num} after LTP whole-word marking: {input_tokens}")
            print(f"Final ref_id (positions of ## subword tails) of sample {example_num}: {ref_id}")
            print()
            print_num -= 1
        ref_ids.append(ref_id)
    # make sure a subword mapping was saved for every sentence
    assert len(ref_ids) == len(bert_res)
    return ref_ids
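# Hedged usage sketch for prepare_ref above, assuming the `ltp` and
# `transformers` packages are installed; "bert-base-chinese" is just an
# illustrative checkpoint name.
from ltp import LTP
from transformers import BertTokenizer

ltp_tok = LTP()
bert_tok = BertTokenizer.from_pretrained("bert-base-chinese")
refs = prepare_ref(["古龙洗发水很好用"], ltp_tok, bert_tok, batch_size=1000)
print(refs)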
from ltp import LTP
import time

start = time.time()  # record execution time
ltp = LTP()
segment, hidden = ltp.seg(["這隻程式可以幫我們把網站資料爬下來"])
pos = ltp.pos(hidden)
# ner = ltp.ner(hidden)
# srl = ltp.srl(hidden)
# dep = ltp.dep(hidden)
# sdp = ltp.sdp(hidden)
print(segment)
# print(hidden)
print(pos)
end = time.time()
print(end - start)
kglist = ['大学', '人口', '面积']
text = '姚明的妻子的丈夫的妻子'
text = '我现在在天津,这里有什么大学?'
text = '姚明的妻子'
# ---- Testing shows that NER itself interferes with the question, so such
# words have to be removed from kglist.
# tiaozhuan = searchKG(kglist=['地点', '地址', '大小', '颜色', '老婆', '丈夫'], text='我家住在和平区哪个地方')
# print(tiaozhuan, "this is the result!!!!!!!!!!!!!!!!")

# add jumps based on sentence constituents
seg, hidden = ltp.seg([text])
# sdp = ltp.sdp(hidden, graph=False)
print(seg, "seg")
pos = ltp.pos(hidden)
ner = ltp.ner(hidden)
print("ner", ner)
srl = ltp.srl(hidden)
dep = ltp.dep(hidden)
sdp = ltp.sdp(hidden)
print(ner, "NER result")
seg = seg[0]
dep = dep[0]
sdp = sdp[0]
print(sdp, "semantic dependency analysis!!!!!!!!!!!!!!!!!!!")  # so hard to use.
class Server(object):
    def __init__(self, path: str = 'small', batch_size: int = 50,
                 device: str = None, onnx: bool = False):
        # note: the onnx flag is accepted but unused in this variant
        self.ltp = LTP(path=path, device=device)
        self.split = lambda a: map(lambda b: a[b:b + batch_size],
                                   range(0, len(a), batch_size))

    def _build_words(self, words, pos, dep):
        res = [{'id': -1, 'length': 0, 'offset': 0, 'text': 'root'}]
        for word, p, (id, parent, relation) in zip(words, pos, dep):
            offset = res[-1]['offset'] + res[-1]['length']
            res.append({
                'id': id - 1,
                'length': len(word),
                'offset': offset,
                'text': word,
                'pos': p,
                'parent': parent - 1,
                'relation': relation,
                'roles': [],
                'parents': []
            })
        return res[1:]

    def _predict(self, sentences: List[str]):
        result = []
        for sentences_batch in self.split(sentences):
            batch_seg, hidden = self.ltp.seg(sentences_batch)
            batch_pos = self.ltp.pos(hidden)
            batch_ner = self.ltp.ner(hidden)
            batch_srl = self.ltp.srl(hidden)
            batch_dep = self.ltp.dep(hidden, fast=False)
            batch_sdp = self.ltp.sdp(hidden, mode='mix')
            for sent, seg, pos, ner, srl, dep, sdp in \
                    zip(sentences_batch, batch_seg, batch_pos, batch_ner,
                        batch_srl, batch_dep, batch_sdp):
                words = self._build_words(seg, pos, dep)
                for word, token_srl in zip(words, srl):
                    for role, start, end in token_srl:
                        text = "".join(seg[start:end + 1])
                        offset = words[start]['offset']
                        word['roles'].append({
                            'text': text,
                            'offset': offset,
                            'length': len(text),
                            'type': role
                        })
                for start, end, label in sdp:
                    words[start - 1]['parents'].append({
                        'parent': end - 1,
                        'relate': label
                    })
                nes = []
                for role, start, end in ner:
                    text = "".join(seg[start:end + 1])
                    nes.append({
                        'text': text,
                        'offset': start,
                        'ne': role.lower(),
                        'length': len(text)
                    })
                result.append({'text': sent, 'nes': nes, 'words': words})
        return result

    def serve(self, port: int = 5000, n_process: int = None):
        if n_process is None:
            n_process = 1 if sys.platform == 'win32' else 8
        fmt = LogFormatter(fmt='%(asctime)s - %(levelname)s - %(message)s',
                           datefmt='%Y-%m-%d %H:%M:%S', color=True)
        root_logger = logging.getLogger()
        console_handler = logging.StreamHandler()
        file_handler = logging.FileHandler('server.log')
        console_handler.setFormatter(fmt)
        file_handler.setFormatter(fmt)
        root_logger.addHandler(console_handler)
        root_logger.addHandler(file_handler)

        app_log.setLevel(logging.INFO)
        gen_log.setLevel(logging.INFO)
        access_log.setLevel(logging.INFO)

        app_log.info("Model is loading...")
        app_log.info("Model Has Been Loaded!")

        app = Application([(r"/.*", LTPHandler, dict(ltp=self))])
        server = HTTPServer(app)
        server.bind(port)
        server.start(n_process)
        ioloop.IOLoop.instance().start()
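# Hedged usage sketch for Server above, assuming the tornado objects
# (Application, HTTPServer, LTPHandler, loggers) imported by the original
# module are available.
if __name__ == '__main__':
    server = Server(path='small', batch_size=50)
    print(server._predict(["他叫汤姆去拿外衣。"]))
    # server.serve(port=5000)  # start the HTTP service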
def save_as_txt(data):
    from ltp import LTP
    import random
    ltp = LTP()
    for row in data:
        id = row[0]
        school_id = ("000" + str(row[1]))[-4:]
        texts = row[2]
        textlines = texts.split('\n')
        # break long lines into chunks of at most 100 characters
        shortened_textlines = []
        for line in textlines:
            line_len = len(line)
            if line_len > 100:
                for i in range(line_len // 100):
                    shortened_textlines.append(line[i * 100:(i + 1) * 100])
            else:
                shortened_textlines.append(line)
        text = ' '.join(shortened_textlines)

        path = './data/' + str(school_id)
        if not os.path.exists(path):
            os.makedirs(path)
        with open(path + '/' + str(school_id) + "-" + str(id) + ".txt",
                  'w', encoding='UTF-8') as file:
            file.write(text)
        print("\rSaved " + str(school_id) + "-" + str(id) + ".txt", end="")

        # example .ann (brat) lines this function produces:
        # T2 报告人 68 71 曹进德
        # R2 报告人_单位 Arg1: T2 Arg2: T1
        seg, hidden = ltp.seg([text])
        ner = ltp.ner(hidden)
        ner_info = []
        entities_nh = []  # person names (Nh)
        entities_ni = []  # institution names (Ni)
        print(type(text))
        print()
        for i in ner[0]:
            if i[0] == 'Nh':
                start = i[1]
                end = i[2]
                entity = "".join(seg[0][start:end + 1])
                if len(entity) > 1:
                    entities_nh.append(entity)
            elif i[0] == 'Ni':
                start = i[1]
                end = i[2]
                entity = "".join(seg[0][start:end + 1])
                if entity in schoolnames:
                    entities_ni.append(entity)

        # annotate person names (姓名)
        for entity in set(entities_nh):
            # re.escape: entities are literal strings, not regexes
            pattern = re.compile(re.escape(entity))
            matches = pattern.finditer(text)
            count = 0
            for record in matches:
                ner_info.append("T" + str(300 + count) + "\t姓名 " +
                                str(record.span()[0]) + " " + str(record.span()[1]) +
                                "\t" + str(record.group()) + "\n")
                count += 1

        # annotate institutions (单位)
        for entity in set(entities_ni):
            pattern = re.compile(re.escape(entity))
            matches = pattern.finditer(text)
            count = 0
            for record in matches:
                ner_info.append("T" + str(400 + count) + "\t单位 " +
                                str(record.span()[0]) + " " + str(record.span()[1]) +
                                "\t" + str(record.group()) + "\n")
                count += 1

        # annotate professional titles (职称)
        pattern = re.compile('教授|副教授|讲师|研究员|副研究员|助理教授|助理研究员')
        matches = pattern.finditer(text)
        count = 0
        for record in matches:
            ner_info.append("T" + str(500 + count) + "\t职称 " +
                            str(record.span()[0]) + " " + str(record.span()[1]) +
                            "\t" + str(record.group()) + "\n")
            count += 1

        # annotate dates (日期)
        date_1 = r"([0-9]+年[0-9]+月[0-9]+日)"  # |([0-9]+月[0-9]+日)
        date_2 = r"([零〇一二三四五六七八九]年[十]?[一二三四五六七八九]月[一二三]?[十]?[一二三四五六七八九十]日)"
        date_3 = r"([0-9]+年[0-9]+月[0-9]+日)" and r"([0-9]+月[0-9]+日)"
        flag = False
        count = 0
        ## variant 1: full numeric dates, falling back to month-day only
        pattern = re.compile(date_1)
        matches = pattern.finditer(text)
        for record in matches:
            ner_info.append("T" + str(600 + count) + "\t日期 " +
                            str(record.span()[0]) + " " + str(record.span()[1]) +
                            "\t" + str(record.group()) + "\n")
            count += 1
            flag = True
        if flag is False:
            pattern = re.compile(date_3)
            matches = pattern.finditer(text)
            for record in matches:
                ner_info.append("T" + str(600 + count) + "\t日期 " +
                                str(record.span()[0]) + " " + str(record.span()[1]) +
                                "\t" + str(record.group()) + "\n")
                count += 1
        ## variant 2: dates written with Chinese numerals
        pattern = re.compile(date_2)
        matches = pattern.finditer(text)
        for record in matches:
            ner_info.append("T" + str(600 + count) + "\t日期 " +
                            str(record.span()[0]) + " " + str(record.span()[1]) +
                            "\t" + str(record.group()) + "\n")
            count += 1

        with open(path + '/' + str(school_id) + "-" + str(id) + ".ann",
                  'w', encoding='UTF-8') as file:
            print([text])
            print(ner_info)
            file.writelines(ner_info)
        print("\rSaved " + str(school_id) + "-" + str(id) + ".ann", end="")
# coding:utf-8
# A runnable LTP example; just run it directly.
from ltp import LTP

ltp = LTP()
seg, hidden = ltp.seg(["他叫汤姆去拿外衣。"])
pos = ltp.pos(hidden)
print(seg[0])
print(pos[0])
result = []
for idx, val in enumerate(seg[0]):
    pag = [val, pos[0][idx]]
    result.append(pag)
print(result)
# import synonyms
import json

# sen1 = "程序员"
# sen2 = "软件工程师"
# r = synonyms.compare(sen1, sen2, seg=True)
# print(r)

# ddp = DDParser()
# # a single sentence
# re = ddp.parse("语文老师")
# print(re)

from ltp import LTP

ltp = LTP()  # loads the Small model by default
seg, hidden = ltp.seg(["语文老师"])
pos = ltp.pos(hidden)
ner = ltp.ner(hidden)
srl = ltp.srl(hidden)
dep = ltp.dep(hidden)
sdp = ltp.sdp(hidden)
print(seg)
# print(hidden)
print(pos)
print(ner)
print(srl)
print(dep)
class CnProcessor: r""" Text Processor class implement NER. """ _instance_lock = threading.Lock() def __init__(self): self.__ner = None self.__pos = None # Single instance mode def __new__(cls, *args, **kwargs): if not hasattr(CnProcessor, "_instance"): with CnProcessor._instance_lock: if not hasattr(CnProcessor, "_instance"): CnProcessor._instance = object.__new__(cls) return CnProcessor._instance @staticmethod def word_tokenize(sent): r""" tokenize fiction :param str sent: the sentence need to be tokenized :return: list.the tokens in it """ assert isinstance(sent, str) return [word for word in sent] def get_ner(self, sentence): r""" NER function. :param str sent: the sentence need to be ner :return two forms of tags The first is the triple form (tags,start,end) The second is the list form, which marks the ner label of each word such as 周小明去玩 ['Nh', 'Nh', 'Nh', 'O', 'O'] """ assert isinstance(sentence, (list, str)) from ltp import LTP if isinstance(sentence, list): # Turn the list into sentence tmp = '' for word in sentence: tmp += word sentence = tmp if not sentence: return [], [] if self.__ner is None: self.__ner = LTP() seg, hidden = self.__ner.seg([sentence]) seg = seg[0] ner = self.__ner.ner(hidden) ner = ner[0] ner_label = len(sentence) * ['O'] for i in range(len(ner)): tag, start, end = ner[i] tmp = 0 for j in range(start): tmp += len(seg[j]) start = tmp tmp = 0 for j in range(end + 1): tmp += len(seg[j]) end = tmp ner[i] = (tag, start, end - 1) for j in range(start, end): ner_label[j] = tag return ner, ner_label def get_pos_tag(self, sentence): r""" pos tag function. :param str sentence: the sentence need to be ner :return: the triple form (tags,start,end) """ assert isinstance(sentence, (list, str)) from ltp import LTP if isinstance(sentence, list): # Turn the list into sentence tmp = '' for word in sentence: tmp += word sentence = tmp if not sentence: return [] if self.__pos is None: # get pos tag self.__pos = LTP() seg, hidden = self.__pos.seg([sentence]) pos = self.__pos.pos(hidden) seg = seg[0] pos = pos[0] pos_tag = [] cnt = 0 for tag in range(len(pos)): pos_tag.append([pos[tag], cnt, cnt + len(seg[tag]) - 1]) cnt += len(seg[tag]) return pos_tag
proxies = {'http': 'http://localhost:8888', 'https': 'http://localhost:8888'}
# initialize the segmentation tool
ltp = LTP(proxies=proxies)

sentences = pdfreader.getTestFromPdf()['Text']
seg = []
sdp = []
dep = []
pos = []
cluster = []
# segment each sentence and run semantic dependency analysis
for st in sentences:
    if st != '':
        seg_temp, hidden = ltp.seg([st])
        # semantic dependency relations
        sdp.append(ltp.sdp(hidden)[0])
        # POS tag list
        pos.append(ltp.pos(hidden)[0])
        # word segmentation list
        seg.append(seg_temp[0])
        # syntactic dependency relations
        dep.append(ltp.dep(hidden)[0])

# collect the extracted triples
resultTriad = []
for index in range(len(dep)):
    r = getTriad(dep[index], seg[index], pos[index])
    resultTriad.append(r)
from ltp import LTP
from config import LTP4_MODEL_DIR
from ESIServer.component.open_relation_extraction.nlp import NLP

nlp = NLP()


class NLPLTP:
    def __init__(self, default_model_dir=LTP4_MODEL_DIR):
        print(default_model_dir)
        self.ltp = LTP(path=default_model_dir)


if __name__ == '__main__':
    ltp = LTP(path=LTP4_MODEL_DIR)
    seg, hidden = ltp.seg(["他叫汤姆去拿外衣。", "他就读于复旦大学。", "吴秀波diss李明"])
    pos = ltp.pos(hidden)
    ner = ltp.ner(hidden)
    dep = ltp.dep(hidden)
    srl = ltp.srl(hidden)
    sdp = ltp.sdp(hidden)
    print(seg)
    print(pos)
    print(ner)
    print(dep)
    print(srl)
    print(sdp)

    origin_sentences = ["他叫汤姆去拿外衣。", "他就读于复旦大学。"]
    lemmas, hidden = nlp.segment(origin_sentences)
    words_postag = nlp.postag(lemmas, hidden)
class EntityDescribeExtractByRoleAnalysisV1(object):
    """
    Extract entities and their descriptions via LTP semantic role labeling.
    """

    def __init__(self, ltp_model_path="tiny"):
        self.ltp = LTP(ltp_model_path)

    def single_sentence(self, input_sentence, ind=0):
        seg, hidden = self.ltp.seg([input_sentence])
        words = seg[ind]
        pos = self.ltp.pos(hidden)[ind]
        roles = self.ltp.srl(hidden, keep_empty=False)[ind]
        filter_p = {"是", "为"}
        role_list = ["A0", "A1", "A2", "A3", "A5"]
        # print(words)
        spo_list = []
        for role in roles:
            r_indx, r_list = role
            p_value = words[r_indx]
            r_list = list(filter(lambda x: x[0] in role_list, r_list))
            if len(r_list) != 2:
                continue
            sub = r_list[0]
            obj = r_list[1]
            if sub[0] not in role_list:
                continue
            if obj[0] not in role_list:
                continue
            # the subject must end before the predicate, the object start after it
            if sub[2] >= r_indx:
                continue
            if obj[1] <= r_indx:
                continue
            # predicate filter
            if p_value not in filter_p:
                continue
            if p_value == "为":
                sub, obj = obj, sub
            # POS filter: the subject head must be a noun
            if pos[sub[2]] not in ["n", "nz"]:
                continue
            sub_value = words[sub[1]:sub[2] + 1]
            obj_value = words[obj[1]:obj[2] + 1]
            # print("".join(sub_value), p_value, "".join(obj_value))
            spo_list.append(("".join(sub_value), p_value, "".join(obj_value)))
        return spo_list

    def extract_info(self, input_sentence_list):
        """
        Extract entity description information.

        Args:
            input_sentence_list:
        Returns:
            entity_describe_res: List[{"sentence": xxx, "entity": xxx, "describe": xxx}]
        """
        entity_describe_res = []
        for i, sentence in enumerate(input_sentence_list):
            sentence = sentence.strip()
            if len(sentence) < 10:
                continue
            if len(sentence) > 100:
                continue
            if not re.fullmatch(r"^[\u4e00-\u9fa5_a-zA-Z]{1,15}是.+$", sentence):
                continue
            out_spo_list = self.single_sentence(sentence)
            for spo in out_spo_list:
                entity_describe_res.append({
                    "sentence": sentence,
                    "entity": spo[0],
                    "describe": spo[2]
                })
        return entity_describe_res

    # def single_sentence_v2(self, input_sentence):
    #     sentence_feature = [(cut.DEPREL, cut.LEMMA) for cut in HanLP.parseDependency(input_sentence)]
    #     if sentence_feature[0][0] != "主谓关系":
    #         return True
    #     if ("核心关系", "是") not in sentence_feature:
    #         return True
    #     return False

    def multi_extract_info(self, input_sentence_list):
        pool = multiprocessing.Pool(processes=3)
        spo_res = []
        for i, sentence in enumerate(input_sentence_list):
            sentence = sentence.strip()
            if len(sentence) == 0:
                continue
            out_spo_list = pool.apply_async(self.single_sentence, (sentence,))
            # out_spo_list = self.single_sentence(sentence)
            spo_res.append(out_spo_list)
            # spo_res.append((sentence, out_spo_list))
        pool.close()
        pool.join()
        spo_res = [spo.get() for i, spo in enumerate(spo_res)]
        return spo_res
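# Hedged usage sketch for EntityDescribeExtractByRoleAnalysisV1 above,
# assuming `re`, `multiprocessing`, and `LTP` are imported as in the original
# module; the sentence is illustrative and satisfies the extract_info filters
# (10-100 chars, "<entity>是..." pattern).
extractor = EntityDescribeExtractByRoleAnalysisV1(ltp_model_path="tiny")
print(extractor.extract_info(["北京是中华人民共和国的首都。"]))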
def text_work_summary_parser_ltp(textList):
    ltp = LTP()
    wa, ha = ltp.seg(textList)
    pa = ltp.pos(ha)
    return wa, pa
# sentence = '整天不是吃饭就是睡觉,活得真像一头猪。'
# sentence = '整天睡觉,活得真像一头猪。'
# sentence = 'x要人名币。'
# sentence = '一切动物和植物都是生物。'
# sentence = '美丽的小猪和优雅的小熊,是一对好朋友。'
# sentence = '任意的整数和任意的浮点数的乘积是浮点数。'
# sentence = '并非x0都是偶数。'
# sentence = '并非每个自然数都是偶数。'
# sentence = '1+1*3*(2+4)'
# sentence = '(1+1)*3/(2+4)'
# sentence = '(4-3)/(5-3)'
# sentence = '4-3*5+2'
# sentence = '4-3*5'
sentence = '4*3'
seg, hidden = ltp.seg([sentence])
pos = ltp.pos(hidden)
sdp = ltp.sdp(hidden)
srl = ltp.srl(hidden, keep_empty=False)
dep = ltp.dep(hidden)
print(seg)
print(pos)
print(sdp)
print(srl)
print(dep)

# a = [(1, [(2, (3, 4)), (2, (3, 4))]), (10, [(20, (30, 40)), (20, (30, 40))])]
# b = []
# def re(a):
#     x = type(a)
"运营管理问题":6, "程序规章手册缺陷":7, "安检空管维修资质等其它":8} myclasses = ["航空器系统/部件失效", "航空器设计制造缺陷", "机务人员致灾", "机组人员致灾", "零件生产质量问题", "运营管理问题", "程序规章手册缺陷", "安检空管维修资质等其它"] from ltp import LTP ltp = LTP() X_dataset, _ = ltp.seg(X.tolist()) X_dataset = np.array(X_dataset, dtype=object) y_dataset = [[],[],[],[],[],[],[],[]] for i in range(len(myclasses)): for item in y.tolist(): if myclasses[i] in item: y_dataset[i].append(1) else: y_dataset[i].append(0) y_datatset = np.mat(y_dataset) np.save("X_ltp.npy", X_dataset) np.save("y_ltp.npy",y_dataset)
class Server(object):
    def __init__(self, path: str = 'small', batch_size: int = 50,
                 device: str = None, onnx: bool = False):
        if onnx:
            self.ltp = FastLTP(path=path, device=device)
        else:
            self.ltp = LTP(path=path, device=device)
        self.split = lambda a: map(lambda b: a[b:b + batch_size],
                                   range(0, len(a), batch_size))

    def _build_words(self, words, pos, dep):
        res = [{'id': -1, 'length': 0, 'offset': 0, 'text': 'root'}]
        for word, p, (id, parent, relation) in zip(words, pos, dep):
            offset = res[-1]['offset'] + res[-1]['length']
            res.append({
                'id': id - 1,
                'length': len(word),
                'offset': offset,
                'text': word,
                'pos': p,
                'parent': parent - 1,
                'relation': relation,
                'roles': [],
                'parents': []
            })
        return res[1:]

    def _predict(self, sentences: List[str]):
        result = []
        for sentences_batch in self.split(sentences):
            batch_seg, hidden = self.ltp.seg(sentences_batch)
            batch_pos = self.ltp.pos(hidden)
            batch_ner = self.ltp.ner(hidden)
            batch_srl = self.ltp.srl(hidden)
            batch_dep = self.ltp.dep(hidden)
            batch_sdp = self.ltp.sdp(hidden)
            for sent, seg, pos, ner, srl, dep, sdp in \
                    zip(sentences_batch, batch_seg, batch_pos, batch_ner,
                        batch_srl, batch_dep, batch_sdp):
                words = self._build_words(seg, pos, dep)
                for word, token_srl in zip(words, srl):
                    for role, start, end in token_srl:
                        text = "".join(seg[start:end + 1])
                        offset = words[start]['offset']
                        word['roles'].append({
                            'text': text,
                            'offset': offset,
                            'length': len(text),
                            'type': role
                        })
                for start, end, label in sdp:
                    words[start - 1]['parents'].append({
                        'parent': end - 1,
                        'relate': label
                    })
                nes = []
                for role, start, end in ner:
                    text = "".join(seg[start:end + 1])
                    nes.append({
                        'text': text,
                        'offset': start,
                        'ne': role.lower(),
                        'length': len(text)
                    })
                result.append({'text': sent, 'nes': nes, 'words': words})
        return result
from ltp import LTP

ltp = LTP()
segment, hidden = ltp.seg(["南京市长江大桥。"])
print(segment)

# split the raw text into sentences first, then segment the pieces
sentences = ltp.sent_split(["南京市长江大桥。", "汤姆生病了。他去医院了。"])
print(sentences)
segment, hidden = ltp.seg(sentences)
print(segment)
print(hidden)
pos_tags = ltp.pos(hidden)
print(pos_tags)
class Run(object):
    def __init__(self, path: str = 'small', batch_size: int = 50,
                 device: str = None, onnx: bool = False):
        if onnx:
            self.ltp = FastLTP(path=path, device=device, need_config=True)
        else:
            self.ltp = LTP(path=path, device=device, need_config=True)
        self.split = lambda a: map(lambda b: a[b:b + batch_size],
                                   range(0, len(a), batch_size))

    def _build_words(self, words, pos, dep):
        # the dummy root entry only seeds the offset computation; it is dropped
        res = [{'id': -1, 'length': 0, 'offset': 0, 'text': 'root'}]
        for word, p, (id, parent, relation) in zip(words, pos, dep):
            offset = res[-1]['offset'] + res[-1]['length']
            res.append({
                'id': id - 1,
                'length': len(word),
                'offset': offset,
                'text': word,
                'pos': p,
                'parent': parent - 1,
                'relation': relation,
                'roles': [],
                'parents': []
            })
        return res[1:]

    def _predict(self, sentences: List[str]):
        result = []
        for sentences_batch in self.split(sentences):
            batch_seg, hidden = self.ltp.seg(sentences_batch)
            batch_pos = self.ltp.pos(hidden)
            batch_ner = self.ltp.ner(hidden)
            batch_srl = self.ltp.srl(hidden)
            batch_dep = self.ltp.dep(hidden)
            batch_sdp = self.ltp.sdp(hidden)
            for sent, seg, pos, ner, srl, dep, sdp in \
                    zip(sentences_batch, batch_seg, batch_pos, batch_ner,
                        batch_srl, batch_dep, batch_sdp):
                words = self._build_words(seg, pos, dep)
                for word, token_srl in zip(words, srl):
                    for role, start, end in token_srl:
                        text = "".join(seg[start:end + 1])
                        offset = words[start]['offset']
                        word['roles'].append({
                            'text': text,
                            'offset': offset,
                            'length': len(text),
                            'type': role
                        })
                for start, end, label in sdp:
                    words[start - 1]['parents'].append({
                        'parent': end - 1,
                        'relate': label
                    })
                nes = []
                for role, start, end in ner:
                    text = "".join(seg[start:end + 1])
                    nes.append({
                        'text': text,
                        'offset': start,
                        'ne': role.lower(),
                        'length': len(text)
                    })
                result.append({'text': sent, 'nes': nes, 'words': words})
        return result

    def test(self, sentences: List[str] = None):
        # register a custom word so it is segmented as one token
        self.ltp.add_words("DMI与主机通讯中断")
        if sentences is None:
            sentences = ["他叫汤姆去拿外衣。"]
        res = self._predict([sentence.strip() for sentence in sentences])
        print(json.dumps(res, indent=2, sort_keys=True, ensure_ascii=False))

    def save(self, out='ltp.npz'):
        import numpy as np
        # export model weights as a .npz archive, renaming the LayerNorm
        # parameters from gamma/beta to weight/bias
        nps = {}
        for k, v in self.ltp.model.state_dict().items():
            k = k.replace("gamma", "weight").replace("beta", "bias")
            nps[k] = np.ascontiguousarray(v.cpu().numpy())
        np.savez(out, **nps)
        config = self.ltp.config
        with open('config.json', 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=2)

    def test_seged(self):
        import torch
        sentences = [
            'My name is tom.',
            'He called Tom to get coats.',
            '他叫Tom去拿外衣。',
            '他叫汤姆去拿外衣。',
            "我去长江大桥玩。"
        ]
        seg, hidden = self.ltp.seg(sentences)
        # re-run with the words pre-segmented; the hidden states should match
        seged, hidden_seged = self.ltp.seg(seg, is_preseged=True)
        hidden: dict
        hidden_seged: dict
        for key, value in hidden.items():
            if isinstance(value, torch.Tensor):
                test = torch.sum(value.float() - hidden_seged[key].float()).numpy()
                print(key, test)
        print(seg == seged)