def parser_initial():
    par_model_path = os.path.join(get_config('ner', 'LTP_DATA_DIR'),
                                  'parser.model')  # dependency parser model path; the model file is `parser.model`
    parser = Parser()  # initialize the instance
    parser.load(par_model_path)  # load the model
    return parser
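# A minimal usage sketch for parser_initial() above (hedged: assumes the
# pyltp 0.2.x load()/release() API used throughout this file and a valid
# `parser.model`; the sample words/POS tags are illustrative only).
# Note that arc.head is 1-based and 0 denotes the pseudo ROOT node.
def _demo_parser_initial():
    parser = parser_initial()
    arcs = parser.parse(['他', '叫', '汤姆'], ['r', 'v', 'nh'])
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # release the model when finished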
def parse(words, postags):
    parser = Parser()  # initialize the instance
    parser.load(par_model_path)  # load the model; `par_model_path` is assumed to be defined at module level
    arcs = parser.parse(words, postags)  # dependency parsing
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # release the model
    return arcs
def get_word_dependence(data):
    """
    Get the dependency relations between words.
    :param data: iterable of lines, each a space-separated list of `word@@pos` tokens
    :return: per-line lists of [word, head word, relation] triples
    """
    print('get_word_dependence')
    print('load ltp model')
    LTP_DATA_DIR = './ltp_data_v3.4.0/'  # path to the LTP model directory
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parser model path; the model file is `parser.model`
    parser = Parser()  # initialize the instance
    parser.load(par_model_path)  # load the model
    print('read file')
    res = []
    for txt in data:
        word_pos_list = txt.strip(' \n').split()
        res_line = []
        words = []
        postags = []
        for word_pos in word_pos_list:
            word, pos = word_pos.split('@@')
            words.append(word)
            postags.append(pos)
        arcs = list(parser.parse(words, postags))  # dependency parsing
        for i in range(len(words)):
            res_word = [
                str(words[i]),
                str(words[int(arcs[i].head) - 1]),  # head word (arc.head is 1-based; 0 would mean ROOT)
                str(arcs[i].relation),
            ]
            res_line.append(res_word)
        res.append(res_line)
    print('release model')
    parser.release()  # release the model
    return res
def jufa_fenxi(words, postags):
    """Dependency parsing."""
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    arcs = parser.parse(words, postags)
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # release the model
def ltp_word(self):
    """Run segmentation, POS tagging, parsing and NER over the sentence."""
    # Word segmentation
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    words = segmentor.segment(self.content)
    # print("*************Segmentation*****************")
    # print("\t".join(words))

    # POS tagging
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    postags = postagger.postag(words)
    # print("*************POS tagging*************")
    # print("\t".join(postags))

    # Dependency parsing
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    arcs = parser.parse(words, postags)
    # print("*************Dependency parsing*************")
    # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

    # Split the parse result into heads and relations
    arcs_head = []
    arcs_relation = []
    for arc in arcs:
        arcs_head.append(arc.head)
        arcs_relation.append(arc.relation)

    # Named entity recognition
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    netags = recognizer.recognize(words, postags)
    # print("*************NER*************")
    # print("\t".join(netags))

    """
    # Semantic role labeling
    labeller = SementicRoleLabeller()
    labeller.load(os.path.join(MODELDIR, "pisrl.model"))
    roles = labeller.label(words, postags, arcs)
    print("*************Semantic role labeling*************")
    for role in roles:
        print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
             for arg in role.arguments]))
    """

    segmentor.release()
    postagger.release()
    parser.release()
    recognizer.release()
    # labeller.release()

    # Convert the results into lists via list_conversion
    words_result = list_conversion(words, postags, netags, arcs_head, arcs_relation)
    return words_result
def parse(words, postags):
    parser = Parser()  # initialize the instance
    parser.load(os.path.join(LTP_DATA_DIR, 'parser.model'))  # load the model
    arcs = parser.parse(words, postags)  # dependency parsing
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # release the model
    return arcs
def init_pyltp(model_dir, dict_file=None):
    '''
    Initialize the pyltp components.
    :param model_dir: path to the model directory
    :param dict_file: external lexicon for segmentation
    :return: segmentor, postagger, parser, ner
    '''
    segmentor = Segmentor()
    postagger = Postagger()
    parser = Parser()
    ner = NamedEntityRecognizer()
    cws_model = os.path.join(model_dir, 'cws.model')
    pos_model = os.path.join(model_dir, 'pos.model')
    parser_model = os.path.join(model_dir, 'parser.model')
    ner_model = os.path.join(model_dir, 'ner.model')
    if dict_file:
        segmentor.load_with_lexicon(cws_model, dict_file)
    else:
        segmentor.load(cws_model)
    postagger.load(pos_model)
    ner.load(ner_model)
    parser.load(parser_model)
    return segmentor, postagger, parser, ner
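# Hedged usage sketch for init_pyltp(): wire the four returned components into
# one pipeline and release them afterwards. The './ltp_data' directory and the
# sample sentence are assumptions for illustration only.
def _demo_init_pyltp():
    segmentor, postagger, parser, ner = init_pyltp('./ltp_data')
    words = segmentor.segment('中国进出口银行与中国银行加强合作')
    postags = postagger.postag(words)
    arcs = parser.parse(words, postags)
    netags = ner.recognize(words, postags)
    print("\t".join(words))
    print("\t".join(postags))
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    print("\t".join(netags))
    for model in (segmentor, postagger, parser, ner):
        model.release()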
def parse(words, postags):
    parser = Parser()
    parser.load('..\\LTP\\ltp_data_v3.4.0\\parser.model')
    arcs = parser.parse(words, postags)  # dependency parsing
    file_out = open(".\\jffx.txt", "w")
    print("Parsing result:")
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    file_out.write("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # release the model
    file_out.close()
    return
def _load_testset(self):
    """
    Load the test set.
    :return: list of aligned examples
    """
    par_model_path = os.path.join(self.ltp_dir, 'parser.model')
    pos_model_path = os.path.join(self.ltp_dir, 'pos.model')
    postagger = Postagger()
    postagger.load(pos_model_path)
    parser = Parser()
    parser.load(par_model_path)
    examples = []
    with open(os.path.join(self.data_dir, self.file_name)) as f:
        for l in tqdm(f):
            l = json.loads(l)
            # Segmentation / POS / NER: Chinese NER here is a character-level
            # model (BERT), so list() turns strings into character lists.
            # Output entries have the form (entity, type, begin, end).
            text_seg = jieba.lcut(l['text'], HMM=False)
            poses = ' '.join(postagger.postag(text_seg)).split()
            arcs = parser.parse(text_seg, poses)
            arcses = ' '.join("%d:%s" % (arc.head, arc.relation) for arc in arcs).split()
            examples.append(self.align_bert_4_inference(l, text_seg, arcses))
    return examples
def __init__(self, config):
    self.config = config
    random_seed = config['random_seed']
    random.seed(random_seed)
    torch.manual_seed(random_seed)       # CPU
    torch.cuda.manual_seed(random_seed)  # GPU
    np.random.seed(random_seed)          # numpy
    if self.config['use_bert']:
        self.tokenizer = BertTokenizer.from_pretrained(self.config['bert_model_name'],
                                                       cache_dir=config['bert_dir'])
    elif self.config['use_xlnet']:
        self.tokenizer = XLNetTokenizer.from_pretrained('hfl/chinese-xlnet-base',
                                                        cache_dir=config['xlnet_dir'])
    elif self.config['use_transformer'] or self.config['use_rnn_basic_encoder']:
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',
                                                       cache_dir=config['bert_dir'])
    else:
        raise Exception('Not support other basic encoder')
    self.latest_epoch = 0
    if self.config['cut_word_task'] or self.config['pos_tag_task'] or self.config['parser_task']:
        cws_model_path = os.path.join(self.config['ltp_path'], 'cws.model')
        segmentor = Segmentor()
        segmentor.load(cws_model_path)
        self.segmentor = segmentor
    if self.config['pos_tag_task'] or self.config['parser_task']:
        pos_model_path = os.path.join(self.config['ltp_path'], 'pos.model')
        postagger = Postagger()
        postagger.load(pos_model_path)
        self.postagger = postagger
    if self.config['parser_task']:
        parser_model_path = os.path.join(self.config['ltp_path'], 'parser.model')
        parser = Parser()
        parser.load(parser_model_path)
        self.parser = parser
class LTP:
    def __init__(self):
        self.segmentor = Segmentor()  # word segmenter
        self.segmentor.load_with_lexicon(Config.SEGMENTOR_PATH,
                                         Config.PERSONAL_SEGMENTOR_PATH)  # load the model
        self.postagger = Postagger()  # POS tagger
        self.postagger.load(Config.POSTAGGER_PATH)  # load the model
        self.parser = Parser()  # dependency parser
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(Config.NAMED_ENTITY_RECONGNTION_PATH)
        self.parser.load(Config.PARSER_PATH)  # load the model
        self.labeller = SementicRoleLabeller()  # semantic role labeller
        self.labeller.load(Config.LABELLER_PATH)  # load the model
        self.negative_list = get_negative_list()
        self.no_list = get_no_list()
        self.limit_list = get_limit_list()
        self.special_list = get_special_list()
        self.key_sentences = []

    def __del__(self):
        """Release resources."""
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
def parse(text):
    # Dependency parsing
    sentences = seg2sentences(text)
    parser = Parser()
    parser.load(par_model_path)
    arcs_list = []
    for sentence in sentences:
        sentence = keep_core(sentence)
        word_list = u.seg2words_all(sentence)
        words = []
        for word in word_list:
            words.append(word.encode('utf-8'))
        postags = tag(words)
        # Both POS tagging approaches are useful in different situations:
        '''
        tag_list = u.tag(sentence)
        words = []
        postags = []
        for word, tag in tag_list:
            words.append(word.encode('utf-8'))
            postags.append(tag.encode('utf-8'))
        '''
        arcs = parser.parse(words, postags)
        word_objs = []
        for i in range(len(words)):
            word = Word(i + 1, words[i], postags[i], arcs[i].head, arcs[i].relation)
            word.show()
            word_objs.append(word)
        arcs_list.append(word_objs)
    parser.release()  # release the model
    return arcs_list
def parse(words, postags):
    parser = Parser()  # initialize the instance
    parser.load(par_model_path)  # load the model
    arcs = parser.parse(words, postags)  # dependency parsing
    # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # release the model
    '''
    # Visualize the parse result with networkx
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    mpl.rcParams['font.family'] = 'sans-serif'
    G = nx.Graph()  # build an undirected graph G
    # Add nodes
    for word in words:
        G.add_node(word)
    G.add_node('Root')
    # Add edges
    for i in range(len(words)):
        G.add_edge(words[i], heads[i])
    source = '国务院'
    target1 = '谴责'
    distance1 = nx.shortest_path_length(G, source=source, target=target1)
    print("shortest distance between '%s' and '%s' in the dependency graph: %s"
          % (source, target1, distance1))
    nx.draw(G, with_labels=True)
    plt.savefig("undirected_graph.png")
    '''
    return arcs
def extract_views(all_sents):
    segmentor = Segmentor()
    segmentor.load(r'/home/student/project-01/ltp_data/cws.model')
    postagger = Postagger()
    postagger.load(r'/home/student/project-01/ltp_data/pos.model')
    parser = Parser()
    parser.load(r'/home/student/project-01/ltp_data/parser.model')
    views_in_sents = []
    for i, sents in enumerate(all_sents):
        views_tmp = []
        for sent in sents:
            sent = sent.replace('\\n', '\n').strip()
            if len(sent) == 0:
                continue
            # words = list(jieba.cut(sent))
            words = list(segmentor.segment(sent))
            contains = contain_candidates(words)
            if len(contains) == 0:
                continue
            tags = list(postagger.postag(words))
            arcs = list(parser.parse(words, tags))
            sbv, head = get_sbv_head(arcs, words, tags)
            if sbv[0] is None or head[0] is None or head[0] not in contains:
                continue
            subj = sbv[0]
            view = clean_view(words[head[1] + 1:])
            views_tmp.append((subj, view, i))
        if len(views_tmp) > 0:
            views_in_sents.append({'sents': sents, 'views': views_tmp})
    segmentor.release()
    postagger.release()
    parser.release()
    return views_in_sents
def test_ltp(document):
    LTP_DATA_DIR = r"D:\anaconda\envs\TF+3.5\Lib\site-packages\pyltp-model"  # path to the LTP model directory
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parser model, `parser.model`
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # segmentation model, `cws.model`
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model, `pos.model`

    segmentor = Segmentor()  # initialize the instance
    segmentor.load(cws_model_path)  # load the model
    words = segmentor.segment(document)  # segmentation
    print("\n")
    print("Segmentation result:")
    print('\t'.join(words))
    segmentor.release()  # release the model

    postagger = Postagger()  # initialize the instance
    postagger.load(pos_model_path)  # load the model
    postags = postagger.postag(words)  # POS tagging
    print("\n")
    print("POS tagging result:")
    print('\t'.join(postags))
    postagger.release()  # release the model

    parser = Parser()  # initialize the instance
    parser.load(par_model_path)  # load the model
    arcs = parser.parse(words, postags)  # dependency parsing
    print("\n")
    print("Parsing result:")
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # release the model
def dependency_parser(req):
    if req.method == 'POST':
        intext = req.POST["intext"].encode('utf-8', 'ignore')
        words = segmentor(intext)
        tags = posttagger(words)
        parser = Parser()  # initialize the instance
        parser.load(par_model_path)  # load the model
        arcs = parser.parse(words, tags)  # dependency parsing
        # print "Dependency parse:\n"
        # print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)
        outtext = '{"result":['
        for word, arc in zip(words, arcs):
            outtext += '{"content":"' + "%s" % word + '",'
            outtext += '"head"' + ':' + "%d" % (arc.head - 1) + ','
            outtext += '"relate"' + ':"' + "%s" % arc.relation + '"},'
        outtext = outtext.rstrip(',') + ']}'
        parser.release()  # release the model
        response = HttpResponse(outtext)
        response["Access-Control-Allow-Origin"] = "*"
        response["Access-Control-Allow-Methods"] = "POST"
        response["Access-Control-Max-Age"] = "1000"
        response["Access-Control-Allow-Headers"] = "*"
        return response
def parse(words, postags):
    parser = Parser()  # initialize the instance
    parser.load(r'D:\Corpus\ltp_data_v3.4.0\parser.model')  # load the model
    arcs = parser.parse(words, postags)  # dependency parsing
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # release the model
    return arcs  # <pyltp.VectorOfParseResult object at 0x00000000035E6630>
class LtpTree(DepTree):
    def __init__(self, dict_path=None):
        super(LtpTree, self).__init__()
        print("Loading LTP models ...")
        self.segmentor = Segmentor()
        if dict_path is None:
            self.segmentor.load(os.path.join(MODELDIR, "cws.model"))
        else:
            self.segmentor.load_with_lexicon(os.path.join(MODELDIR, "cws.model"), dict_path)
        self.postagger = Postagger()
        self.postagger.load(os.path.join(MODELDIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(MODELDIR, "parser.model"))
        print("Models loaded.")

    def parse(self, sentence):
        self.words = self.segmentor.segment(sentence)
        self.postags = self.postagger.postag(self.words)
        self.arcs = self.parser.parse(self.words, self.postags)
        for i in range(len(self.words)):
            if self.arcs[i].head == 0:
                self.arcs[i].relation = "ROOT"

    def release_model(self):
        # Release the models
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
class LTP_word():
    """Wrapper around the pyltp pipeline.
    deal(text) returns words, POS tags, dependency arcs, semantic roles and named entities.
    release() frees the loaded models."""
    def __init__(self, model_path):
        self.model_path = model_path
        self.segmentor = Segmentor()  # segmentation instance
        self.segmentor.load_with_lexicon(path.join(self.model_path, 'cws.model'),
                                         path.join(self.model_path, 'dictionary_kfc.txt'))
        self.postagger = Postagger()  # POS tagging instance
        self.postagger.load(path.join(self.model_path, 'pos.model'))  # load the model
        self.recognizer = NamedEntityRecognizer()  # NER instance
        self.recognizer.load(path.join(self.model_path, 'ner.model'))
        self.parser = Parser()  # dependency parsing instance
        self.parser.load(path.join(self.model_path, 'parser.model'))  # load the model
        self.labeller = SementicRoleLabeller()  # semantic role labeling instance
        self.labeller.load(path.join(self.model_path, 'srl'))

    def deal(self, text):
        # Run the full pipeline and return everything at once
        words = self.segmentor.segment(text)  # segmentation
        postags = self.postagger.postag(words)  # POS tagging
        netags = self.recognizer.recognize(words, postags)  # named entities
        arcs = self.parser.parse(words, postags)  # dependency parsing
        roles = self.labeller.label(words, postags, netags, arcs)  # semantic role labeling
        return words, postags, arcs, roles, netags

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
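# Hedged usage sketch for LTP_word (assumes `model_path` contains all the model
# files named in __init__, including the `dictionary_kfc.txt` lexicon and the
# older `srl` model directory whose label() call takes netags):
def _demo_ltp_word(model_path):
    ltp = LTP_word(model_path)
    words, postags, arcs, roles, netags = ltp.deal('他叫汤姆去拿外衣。')
    for word, pos, netag in zip(words, postags, netags):
        print('%s/%s/%s' % (word, pos, netag))
    ltp.release()  # frees all five models in one call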
def par(sentences):
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
    parser = Parser()
    parser.load(par_model_path)
    parsers = parser.parse(seg(sentences), pos(sentences))
    parser.release()
    return parsers
class Parse_Util(object):
    def __init__(self, lexicon_path='./data/lexicon'):
        # Word segmentation
        self.segmentor = Segmentor()
        # self.segmentor.load_with_lexicon(cws_model_path, lexicon_path)
        self.segmentor.load(cws_model_path)
        # POS tagging
        self.postagger = Postagger()
        self.postagger.load(pos_model_path)
        # Dependency parsing
        self.parser = Parser()
        self.parser.load(par_model_path)
        # Named entity recognition
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model_path)
        # jieba segmentation
        # jieba.load_userdict(lexicon_path)

    def __del__(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()

    # Parse one sentence
    def parse_sentence(self, sentence):
        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        netags = self.recognizer.recognize(words, postags)
        arcs = self.parser.parse(words, postags)
        # child_dict_list = ParseUtil.build_parse_child_dict(words, arcs)
        return words, postags, netags, arcs
class ModelLoader:
    __instance = None

    def __new__(cls):
        if cls.__instance is None:
            cls.__instance = super(ModelLoader, cls).__new__(cls)
            cls.__instance.__initialized = False
        return cls.__instance

    def __init__(self):
        if self.__initialized:
            return
        self.__initialized = True
        LTP_DIR = "./ltp_data"
        # Customized segmentation lexicon; POS tags are adjusted in post-processing
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(os.path.join(LTP_DIR, "cws.model"),
                                         os.path.join(LTP_DIR, 'customized.txt'))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))
        self.sentenceSplitter = SentenceSplitter()
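# Hedged usage sketch: ModelLoader is a singleton, so constructing it twice
# yields the same fully loaded instance and the models are loaded only once
# (sample sentence is illustrative).
def _demo_model_loader():
    loader_a = ModelLoader()
    loader_b = ModelLoader()
    assert loader_a is loader_b  # __new__ returns the one shared instance
    words = loader_a.segmentor.segment('他叫汤姆去拿外衣。')
    print('\t'.join(words))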
def load_all_model():
    """Return instances for segmentation, POS tagging, NER and dependency parsing."""
    LTP_DATA_DIR = 'E:/MYGIT/Project/ltp_data'  # path to the LTP model directory
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # segmentation model, `cws.model`
    segmentor = Segmentor()  # initialize the instance
    segmentor.load_with_lexicon(cws_model_path,
                                './temp_file/cut_external_dict/cut_external_dict')  # load the model
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model, `pos.model`
    postagger = Postagger()  # initialize the instance
    postagger.load_with_lexicon(pos_model_path,
                                './temp_file/pos_external_dict/pos_external_dict')  # load the model
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # NER model, `ner.model`
    recognizer = NamedEntityRecognizer()  # initialize the instance
    recognizer.load(ner_model_path)  # load the model
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parser model, `parser.model`
    parser = Parser()  # initialize the instance
    parser.load(par_model_path)  # load the model
    fname = r"E:/MYGIT/model/wiki_stopwords/wiki_word2vec.kv"
    # model_wv.save(fname)
    model_wv = KeyedVectors.load(fname, mmap='r')
    return [segmentor, postagger, recognizer, parser, model_wv]
def parse(s):
    """
    Parse the sentence and return the analysis.
    parse_result: dependency parse table
    source: word index of the first company entity
    target: word index of the other company entity
    keyword_pos: list of keyword word indices
    source_dep: dependency relation of the first company entity
    target_dep: dependency relation of the other company entity
    """
    tmp_ner_dict = {}
    num_lst = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']
    # Replace company codes with special aliases so segmentation/POS stay correct
    for i, ner in enumerate(list(set(re.findall(r'(ner\_\d\d\d\d\_)', s)))):
        try:
            tmp_ner_dict[num_lst[i] + '号企业'] = ner
        except IndexError:
            return None, None, None, None, None, None
        s = s.replace(ner, num_lst[i] + '号企业')
    words = segmentor.segment(s)
    tags = postagger.postag(words)
    parser = Parser()  # initialize the instance
    parser.load(r'E:\ltp_data\parser.model')  # load the model
    arcs = parser.parse(words, tags)  # dependency parsing
    arcs_lst = list(map(list, zip(*[[arc.head, arc.relation] for arc in arcs])))
    # Tabulate the parse result
    parse_result = pd.DataFrame(
        list(map(list, zip(list(words), list(tags), arcs_lst[0], arcs_lst[1]))),
        index=range(1, len(words) + 1))
    parser.release()
    # Only return a result when at least two companies are found; for now we
    # simplify and consider only the relation between two companies.
    try:
        source = list(words).index('一号企业') + 1
        target = list(words).index('二号企业') + 1
        source_dep = arcs_lst[1][source - 1]
        target_dep = arcs_lst[1][target - 1]
    except ValueError:
        return None, None, None, None, None, None
    # Investment-relation keywords
    key_words = [
        "收购", "竞拍", "转让", "扩张", "并购", "注资", "整合", "并入", "竞购",
        "竞买", "支付", "收购价", "收购价格", "承购", "购得", "购进", "购入",
        "买进", "买入", "赎买", "购销", "议购", "函购", "函售", "抛售", "售卖",
        "销售", "转售"
    ]
    keyword_pos = [
        list(words).index(w) + 1 if w in list(words) else -1 for w in key_words
    ]
    return parse_result, source, target, keyword_pos, source_dep, target_dep
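# Hedged usage sketch for parse() above: the input is expected to embed company
# codes of the form `ner_DDDD_`, which are rewritten to "一号企业"/"二号企业"
# before segmentation. The sample string and the module-level
# segmentor/postagger are assumptions for illustration.
def _demo_company_parse():
    result = parse('ner_0001_拟收购ner_0002_的全部股权')
    if result[0] is not None:
        parse_result, source, target, keyword_pos, source_dep, target_dep = result
        print(parse_result)    # word / POS / head / relation table, 1-based index
        print(source, target)  # word positions of the two company entities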
def parse(words, postags):
    parser = Parser()  # initialize the instance
    parser.load('/Users/chenming/Spyder/3.3.1/ltp_data/parser.model')  # load the model
    arcs = parser.parse(words, postags)  # dependency parsing
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # release the model
    return arcs
def semantic_role_label(self):
    # Dependency parsing
    parser = Parser()
    parser.load('ltp_data/parser.model')
    arcs = parser.parse(self.words, self.postags)
    parser.release()
    labeller = SementicRoleLabeller()
    labeller.load('ltp_data/srl')
    roles = labeller.label(self.words, self.postags, self.netags, arcs)
    label_ax = []  # arguments tagged A0 or A1
    for role in roles:
        label_ax.extend([
            arg for arg in role.arguments
            if arg.name == "A0" or arg.name == "A1"
        ])
    for label in label_ax:
        # Skip A0/A1 agent/patient spans of abnormal length
        if 0 < label.range.end - label.range.start < 10:
            for i in range(label.range.start, label.range.end + 1):
                # Keep common nouns, place names, person names and organization
                # names inside the agent/patient span as entities
                if self.postags[i] in ("n", "ns", "nh", "ni"):
                    self.entity.append(self.words[i])
    labeller.release()
class LtpLanguageAnalysis(object):
    def __init__(self, model_dir="/home/xxx/ltp-3.4.0/ltp_data/"):
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(model_dir, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(model_dir, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(model_dir, "parser.model"))

    def analyze(self, text):
        # Word segmentation
        words = self.segmentor.segment(text)
        print '\t'.join(words)
        # POS tagging
        postags = self.postagger.postag(words)
        print '\t'.join(postags)
        # Dependency parsing
        arcs = self.parser.parse(words, postags)
        print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

    def release_model(self):
        # Release the models
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
class LtpParser:
    def __init__(self):
        LTP_DIR = "./ltp_data_v3.4.0"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))

    '''Semantic role labeling'''
    def format_labelrole(self, words, postags):
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict

    '''Dependency parsing: for every word, keep a dict of its dependent children'''
    def build_parse_child_dict(self, words, postags, arcs):
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:  # arc heads are 1-based
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = [arc_index]
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]        # dependency head ids
        relation = [arc.relation for arc in arcs]   # dependency relations
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]  # head words
        for i in range(len(words)):
            # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [relation[i], words[i], i, postags[i],
                 heads[i], rely_id[i] - 1, postags[rely_id[i] - 1]]
            format_parse_list.append(a)
        return child_dict_list, format_parse_list

    '''Main parser entry point'''
    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list
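# Hedged usage sketch for LtpParser.parser_main() (model directory as assumed
# in __init__; the sentence mirrors the inline example in the comment above):
def _demo_ltp_parser():
    ltp = LtpParser()
    words, postags, child_dict_list, roles_dict, format_parse_list = \
        ltp.parser_main('李克强总理今天来我家了')
    for item in format_parse_list:
        # [relation, word, word_idx, pos, head_word, head_idx, head_pos]
        print(item)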
def fun():
    zi = request.args.get('words', None)
    zi = zi.encode('utf-8')
    response = urllib2.Request('http://154.8.214.203:9999/index?words=' + zi)
    res = urllib2.urlopen(response)
    hjson = json.loads(res.read(), 'utf-8')
    word = []
    tag = []
    for i in hjson['result']:
        word.append(i[0].encode('utf-8'))
        tag.append(i[1].encode('utf-8'))
    LTP_DATA_DIR = '/root/ltp_data_v3.4.0'  # path to the LTP model directory
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
    parser = Parser()
    parser.load(par_model_path)
    arcs = parser.parse(word, tag)
    a = []
    for pr in arcs:
        a.append({'head': pr.head, 'relation': pr.relation})
    for k, i in enumerate(a):
        i['words'] = word[k]
        i['tag'] = tag[k]
    parser.release()  # release the model
    return json.dumps(a, ensure_ascii=False)
def load_dependency_model():
    """Load the dependency parsing model."""
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
    # Dependency parsing
    parser = Parser()
    parser.load(par_model_path)
    return parser
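# Hedged usage sketch for load_dependency_model(): load once, reuse across many
# sentences, then release explicitly (a pairing this helper otherwise leaves to
# the caller). The (words, postags) pairs are assumed pre-tokenized inputs.
def _demo_dependency_model(sentences_with_tags):
    parser = load_dependency_model()
    try:
        for words, postags in sentences_with_tags:
            arcs = parser.parse(words, postags)
            print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    finally:
        parser.release()  # release the model even if parsing fails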
def segmentation(filename, output_filename):
    print "segmenting '%s' to '%s'" % (filename, output_filename)
    f = open(filename, "r")
    lines = f.readlines()
    f.close()
    MODELDIR = "./ltp_data/"
    # Segmentation
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    # POS tagging
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    # Named entity recognition
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    # Parsing (for SVO extraction)
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    f = open(output_filename, "w")
    fner = open(output_filename.split(".")[0] + "_ner.txt", "w")
    for _line in lines:
        line = _line.rstrip("\n\r")
        words = segmentor.segment(line)
        postags = postagger.postag(words)
        # netags = recognizer.recognize(words, postags)
        # arcs = parser.parse(words, postags)
        for i in range(len(words)):
            f.write("%s/%s\t" % (words[i], postags[i]))
            # if netags[i] != 'O':
            #     fner.write("%s/%s\t" % (words[i], netags[i]))
        f.write("\n")
        # fner.write("\n")
    f.close()
    fner.close()
# Top-level demo script (Python 2). `paragraph` is assumed to be defined above;
# MODELDIR points at the LTP model directory.
sentence = SentenceSplitter.split(paragraph)[0]

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
words = segmentor.segment(sentence)
print "\t".join(words)

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# a list-of-strings parameter is supported since 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print "\t".join(postags)

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)
print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print "\t".join(netags)

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
roles = labeller.label(words, postags, netags, arcs)
for role in roles:
    print role.index, "".join(
        ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
         for arg in role.arguments])