def get_postags(self, words):
    """Run LTP part-of-speech tagging over an already-segmented word list.

    Loads the POS model from ``self.pos_model_path``, tags *words*, echoes
    the tags to stdout (tab-separated, kept from the original behaviour),
    releases the native model, and returns the tags.

    :param words: iterable of segmented tokens
    :return: list of POS tag strings, one per input token
    """
    tagger = Postagger()                  # fresh tagger instance
    tagger.load(self.pos_model_path)      # load the POS model from disk
    tags = tagger.postag(words)           # tag the tokens
    print('\t'.join(tags))                # debug echo (original side effect)
    tagger.release()                      # free native model memory
    return list(tags)
def get_all_name(r_filename, w_file):
    """Read *r_filename* line by line, run LTP segmentation / POS tagging /
    NER on each line, pass the results to the external helper
    ``get_some_idea`` and write its key -> values pairs to *w_file*.

    :param r_filename: UTF-8 input file, one sentence per line
    :param w_file: UTF-8 output file; each row is "key\\tval1\\tval2..."
    """
    # global nlp
    LTP_DATA_DIR = r'ltp_data_v3.4.0'  # LTP model directory path
    # word segmentation
    segmentor = Segmentor()  # initialize
    segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'))  # load model
    # part-of-speech tagging
    postagger = Postagger()  # initialize
    postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))  # load model
    # named entity recognition
    recognizer = NamedEntityRecognizer()  # instantiate
    recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))
    f_r = open(r_filename, "r", encoding="utf-8")
    f_w = open(w_file, "w", encoding="utf-8")
    count = 0
    for line in f_r:
        count += 1
        # strip trailing newline and remove literal "\n" escape sequences
        lines = line.strip("\n").replace(r"\n", "")
        # print("----------"+lines)
        words = segmentor.segment(lines)
        postags = postagger.postag(words)
        netags = recognizer.recognize(words, postags)
        # external helper; presumably returns a dict key -> list of tuples,
        # or something falsy when nothing is found — verify against its def
        sen = get_some_idea(line, netags, words)
        print(sen)
        if sen:
            for key in sen:
                # unique second elements of each tuple, tab-joined
                sens = "\t".join(list(set([data[1] for data in sen[key]])))
                f_w.write(key + "\t" + sens + "\n")
    # nlp.close()
    f_r.close()
    f_w.close()
def ltp_pos_data():
    """POS-tag the pre-segmented QC train/test files with LTP.

    Reads ``const.qc_train_seg`` / ``const.qc_test_seg`` (lines of
    "label\\tword word ..."), tags each token, and writes
    "label\\tword/_tag word/_tag ..." lines to the matching
    ``const.qc_train_pos`` / ``const.qc_test_pos`` file.
    """
    # FIX: raw string — the original 'D:\Baidu...' only worked because \B
    # and \l happen not to be escape sequences; on Python 3.12+ it emits a
    # SyntaxWarning and is one character away from silent corruption.
    LTP_DATA_DIR = r'D:\BaiduNetdiskDownload\ltp_data_v3.4.0'  # LTP model dir
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')   # POS model file
    from pyltp import Postagger
    postagger = Postagger()
    postagger.load(pos_model_path)
    result = []
    file = [(const.qc_train_seg, const.qc_train_pos),
            (const.qc_test_seg, const.qc_test_pos)]
    for i in range(2):
        with open(file[i][0], 'r', encoding='utf-8') as f:
            for line in f:  # stream the file; no need for readlines()
                attr = line.strip().split('\t')
                words = attr[1].split(" ")
                words_pos = postagger.postag(words)
                # distinct index name: the original comprehension reused `i`,
                # shadowing the outer file-pair index
                res = ' '.join("{}/_{}".format(words[j], words_pos[j])
                               for j in range(len(words)))
                result.append("{}\t{}\n".format(attr[0], res))
        with open(file[i][1], 'w', encoding='utf-8') as f:
            f.writelines(result)
        result.clear()
    postagger.release()  # release the native model
class pyltp_model():
    """Thin wrapper bundling LTP segmentation, POS tagging and NER."""

    def __init__(self, LTP_DATA_DIR='/Users/didi/Desktop/ltp_data_v3.4.0'):
        # model file locations inside the LTP data directory
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
        self.segmentor = Segmentor()
        self.postagger = Postagger()
        self.recognizer = NamedEntityRecognizer()
        self.segmentor.load(cws_model_path)
        self.postagger.load(pos_model_path)
        self.recognizer.load(ner_model_path)

    def token(self, sentence):
        """Segment *sentence*; tokens recognized as single-word person /
        organization / place entities are replaced by their NE tag string."""
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        netags = list(self.recognizer.recognize(words, postags))
        # substitute the tag itself for S-Nh / S-Ni / S-Ns entities
        return [tag if tag in ('S-Nh', 'S-Ni', 'S-Ns') else word
                for word, tag in zip(words, netags)]

    def close(self):
        """Release all loaded native models."""
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
def run():
    """Segment review text, keep multi-character nouns per sentence, train a
    word2vec model on them, then print the terms most similar to '餐饮'."""
    # segmentation + word selection
    cont = open('key/pinglun_filter_all1.txt', 'r', encoding='utf-8')
    segmentor = Segmentor()  # initialize instance
    # segmentor.load('cws.model') # load model without the user dictionary
    segmentor.load_with_lexicon('cws.model', 'userdict.txt')  # load model plus user dictionary
    postagger = Postagger()  # initialize instance
    postagger.load('pos.model')  # load model
    nwordall = []
    for sentence in cont:
        # NOTE(review): seeded with an empty string — presumably so no
        # sentence yields an empty list for word2vec; confirm it is intended
        nword = ['']
        words = segmentor.segment(sentence)  # segmentation
        # print (' '.join(words))
        postags = postagger.postag(words)  # POS tagging
        for word, tag in zip(words, postags):
            # POS-based selection; earlier experiments kept for reference:
            #   only adverbs:          if tag == 'd':
            #   filter 1-char words:   if((tag == 'n' or tag == 'd' or tag == 'a') and len(word)>1):
            #   nouns + adjectives:    if((tag == 'a' or tag == 'n') and len(word)>1):
            if((tag == 'n') and len(word)>1):  # keep multi-character nouns only
                # print(word+tag)
                nword.append(word)
        nwordall.append(nword)
    # size = embedding dimensions, window = context range, min_count = drop
    # words rarer than this, workers = thread count (original comment said 5
    # for min_count but the code uses 100)
    model = models.word2vec.Word2Vec(nwordall, size=10, window=5, min_count=100, workers=80)
    print('#############################################')
    sim = model.most_similar(positive=[u'餐饮'])
    for s in sim:
        print("word:%s,similar:%s " % (s[0], s[1]))
def get_postag_list(words_list):
    """Tag *words_list* with the LTP POS model at the module-level
    ``pos_model_path`` and return the tags as a plain list."""
    tagger = Postagger()
    tagger.load(pos_model_path)
    try:
        return list(tagger.postag(words_list))
    finally:
        tagger.release()  # always free the native model
class pyltp_impl(Seg):
    """LTP-backed Seg implementation supporting 'seg', 'postag' and 'ner'."""

    def __init__(self, dictpath, mode='seg'):
        super().__init__(mode)
        from pyltp import Segmentor
        from pyltp import Postagger
        from pyltp import NamedEntityRecognizer
        self.ltp_seg = Segmentor()
        self.ltp_pos = Postagger()
        self.ltp_ner = NamedEntityRecognizer()
        # segmenter is always needed; heavier models load only when the
        # requested mode uses them
        self.ltp_seg.load(os.path.join(dictpath, 'cws.model'))
        if mode != 'seg':
            self.ltp_pos.load(os.path.join(dictpath, 'pos.model'))
        if mode == 'ner':
            self.ltp_ner.load(os.path.join(dictpath, 'ner.model'))

    def impl_func(self, sentence):
        """Return, depending on ``self.mode``: raw segmentation ('seg'),
        (word, pos) pairs ('postag'), or (word, ner_tag) pairs ('ner')."""
        segments = self.ltp_seg.segment(sentence)
        if self.mode == 'seg':
            return segments
        pos_tags = self.ltp_pos.postag(segments)
        if self.mode == 'postag':
            return list(zip(segments, pos_tags))
        ner_tags = self.ltp_ner.recognize(segments, pos_tags)
        return list(zip(segments, ner_tags))
def ner_data():
    """Run LTP segmentation / POS / NER over the 'title' column of
    ../data.csv and write one word+NE-tag concatenation per title to
    ./data_processed_recognizer.csv."""
    # segmentation model
    segmentor = Segmentor()
    segmentor.load('cws.model')
    # POS tagging model
    postagger = Postagger()
    postagger.load('pos.model')
    # named entity model
    recognizer = NamedEntityRecognizer()
    # BUG FIX: was `NamedEntityRecognizer.load('ner.model')` — called on the
    # class, so the `recognizer` instance used below was never loaded.
    recognizer.load('ner.model')
    # load the data to be processed
    data_csv = pd.read_csv('../data.csv', encoding='utf-8-sig')
    datas = data_csv['title']
    util = Utils()
    data_processed = open('./data_processed_recognizer.csv', 'w', encoding='utf-8')
    for data in datas:
        words = segmentor.segment(data)
        postags = postagger.postag(words)
        word_split = ' '.join(words).split(' ')
        netags = recognizer.recognize(words, postags)
        netag_split = ' '.join(netags).split(' ')
        # project helper joins words with their NE tags into one row
        concat_word = util.concat(word_split, netag_split, tag='netags')
        data_processed.write(concat_word + '\n')
    data_processed.close()
def test_ltp(document):
    """Demo pipeline: segment *document*, POS-tag the tokens, then
    dependency-parse them, printing each stage's result."""
    LTP_DATA_DIR = r"D:\anaconda\envs\TF+3.5\Lib\site-packages\pyltp-model"  # LTP model directory
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parser model
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # segmentation model
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model

    seg = Segmentor()
    seg.load(cws_model_path)
    words = seg.segment(document)  # segment
    print("\nA")
    print("分词结果:")
    print('\t'.join(words))
    seg.release()

    tagger = Postagger()
    tagger.load(pos_model_path)
    postags = tagger.postag(words)  # POS tag
    print("\n")
    print("词性标注结果:")
    print('\t'.join(postags))
    tagger.release()

    dep = Parser()
    dep.load(par_model_path)
    arcs = dep.parse(words, postags)  # dependency parse
    print("\n")
    print("句法分析结果:")
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    dep.release()
class Parse_Util(object):
    """Holds the full set of LTP analyzers and runs them over sentences.

    Model paths (cws_model_path, pos_model_path, par_model_path,
    ner_model_path) come from module-level globals.
    """

    def __init__(self, lexicon_path='./data/lexicon'):
        # segmentation (lexicon-aware loading intentionally left disabled)
        self.segmentor = Segmentor()
        # self.segmentor.load_with_lexicon(cws_model_path, lexicon_path)
        self.segmentor.load(cws_model_path)
        # POS tagging
        self.postagger = Postagger()
        self.postagger.load(pos_model_path)
        # dependency parsing
        self.parser = Parser()
        self.parser.load(par_model_path)
        # named entity recognition
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model_path)
        # jieba 分词
        # jieba.load_userdict(lexicon_path)

    def __del__(self):
        # free native model memory when the wrapper is collected
        for component in (self.segmentor, self.postagger,
                          self.recognizer, self.parser):
            component.release()

    def parse_sentence(self, sentence):
        """Return (words, postags, netags, arcs) for *sentence*."""
        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        netags = self.recognizer.recognize(words, postags)
        arcs = self.parser.parse(words, postags)
        # child_dict_list = ParseUtil.build_parse_child_dict(words, arcs)
        return words, postags, netags, arcs
class ModelLoader:
    """Singleton that loads every LTP model exactly once and shares them."""
    __instance = None  # class-level singleton slot (name-mangled)

    def __new__(cls):
        if cls.__instance is None:
            cls.__instance = super(ModelLoader, cls).__new__(cls)
            cls.__instance.__initialized = False  # mark __init__ not yet run
        return cls.__instance

    def __init__(self):
        # __init__ runs on every ModelLoader() call; bail out after the first
        if (self.__initialized):
            return
        self.__initialized = True
        LTP_DIR = "./ltp_data"
        # customized segmentation lexicon; POS tags are post-processed elsewhere
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(
            os.path.join(LTP_DIR, "cws.model"),
            os.path.join(LTP_DIR, 'customized.txt'))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))
        self.sentenceSplitter = SentenceSplitter()
def __init__(self, config):
    """Seed all RNGs, build the tokenizer chosen by *config*, and lazily
    load only the LTP models the enabled auxiliary tasks require.

    :param config: dict-like with keys: random_seed, use_bert / use_xlnet /
        use_transformer / use_rnn_basic_encoder, bert_model_name, bert_dir,
        xlnet_dir, cut_word_task, pos_tag_task, parser_task, ltp_path
    :raises Exception: if no supported basic encoder flag is set
    """
    self.config = config
    random_seed = config['random_seed']
    # make runs reproducible across python / torch / cuda / numpy
    random.seed(random_seed)
    torch.manual_seed(random_seed)  # cpu
    torch.cuda.manual_seed(random_seed)  # gpu
    np.random.seed(random_seed)  # numpy
    if self.config['use_bert']:
        self.tokenizer = BertTokenizer.from_pretrained(self.config['bert_model_name'], cache_dir=config['bert_dir'])
    elif self.config['use_xlnet']:
        self.tokenizer = XLNetTokenizer.from_pretrained('hfl/chinese-xlnet-base', cache_dir=config['xlnet_dir'])
    elif self.config['use_transformer'] or self.config['use_rnn_basic_encoder']:
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', cache_dir=config['bert_dir'])
    else:
        raise Exception('Not support other basic encoder')
    self.latest_epoch = 0
    # segmenter is needed by any of the three tasks
    if self.config['cut_word_task'] or self.config['pos_tag_task'] or self.config['parser_task']:
        cws_model_path = os.path.join(self.config['ltp_path'], 'cws.model')
        segmentor = Segmentor()
        segmentor.load(cws_model_path)
        self.segmentor = segmentor
    # POS model is needed for tagging and parsing
    if self.config['pos_tag_task'] or self.config['parser_task']:
        pos_model_path = os.path.join(self.config['ltp_path'], 'pos.model')
        postagger = Postagger()
        postagger.load(pos_model_path)
        self.postagger = postagger
    # dependency parser only for the parser task
    if self.config['parser_task']:
        parser_model_path = os.path.join(self.config['ltp_path'], 'parser.model')
        parser = Parser()
        parser.load(parser_model_path)
        self.parser = parser
class LTP_word():
    """Full LTP pipeline wrapper.

    ``deal()`` processes text and returns words, POS tags, dependency arcs,
    semantic roles and named entities; ``release()`` frees the cached models.
    """
    def __init__(self, model_path):
        self.model_path = model_path
        self.segmentor = Segmentor()  # segmentation instance
        self.segmentor.load_with_lexicon(path.join(self.model_path, 'cws.model'), path.join(self.model_path, 'dictionary_kfc.txt'))
        self.postagger = Postagger()  # POS tagging instance
        self.postagger.load(path.join(self.model_path, 'pos.model') )  # load model
        self.recognizer = NamedEntityRecognizer()  # named entity recognition instance
        self.recognizer.load(path.join(self.model_path, 'ner.model'))
        self.parser = Parser()  # dependency parsing instance
        self.parser.load(path.join(self.model_path, 'parser.model'))  # load model
        self.labeller = SementicRoleLabeller()  # semantic role labelling instance
        self.labeller.load(path.join(self.model_path, 'srl'))

    def deal (self, text):
        """Run the whole pipeline on *text*.

        :return: (words, postags, arcs, roles, netags)
        """
        words = self.segmentor.segment(text)  # segmentation
        postags = self.postagger.postag(words)  # POS tagging
        netags = self.recognizer.recognize(words, postags)  # named entities
        arcs = self.parser.parse(words, postags)  # dependency parse
        # NOTE(review): 4-argument label(words, postags, netags, arcs) matches
        # old pyltp; newer releases take (words, postags, arcs) — verify the
        # installed pyltp version
        roles = self.labeller.label(words, postags, netags, arcs)  # semantic roles
        return words,postags,arcs,roles,netags

    def release(self):
        # free all native model memory
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
class Ltp(NerModel):
    """NerModel implementation backed by pyltp seg / POS / NER models."""

    def __init__(self):
        super(Ltp, self).__init__()
        self._model_path = "./model/ltp/"
        self._seg = Segmentor()
        self._pos = Postagger()
        self._recognizer = NamedEntityRecognizer()
        self._load_model()
        self._object_str = "[INFO] This is ltp object!"
        print("[INFO] All model is load!")

    def __repr__(self):
        return self._object_str

    def _load_model(self):
        # all model files live directly under self._model_path
        for component, filename in ((self._seg, "cws.model"),
                                    (self._pos, "pos.model"),
                                    (self._recognizer, "ner.model")):
            component.load(self._model_path + filename)

    def get_entity(self, sentence):
        """Concatenate every word carrying a non-'O' NE tag.

        Returns the joined string (or the single word when exactly one was
        found); implicitly returns None when no entity is present.
        """
        tokens = self._seg.segment(sentence)
        pos_tags = self._pos.postag(tokens)
        ner_tags = self._recognizer.recognize(tokens, pos_tags)
        entity = [word for word, tag in zip(tokens, ner_tags) if tag != 'O']
        if entity:
            return "".join(entity) if len(entity) > 1 else entity[0]
class LTP:
    """Facade over the pyltp toolchain plus the project's sentiment /
    keyword word lists (paths come from Config)."""

    def __init__(self):
        self.segmentor = Segmentor()  # segmenter
        self.segmentor.load_with_lexicon(
            Config.SEGMENTOR_PATH, Config.PERSONAL_SEGMENTOR_PATH)  # model + user lexicon
        self.postagger = Postagger()  # POS tagger
        self.postagger.load(Config.POSTAGGER_PATH)  # load model
        self.parser = Parser()  # dependency parser
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(Config.NAMED_ENTITY_RECONGNTION_PATH)
        self.parser.load(Config.PARSER_PATH)  # load model
        self.labeller = SementicRoleLabeller()  # semantic role labeller
        self.labeller.load(Config.LABELLER_PATH)  # load model
        self.negative_list = get_negative_list()
        self.no_list = get_no_list()
        self.limit_list = get_limit_list()
        self.special_list = get_special_list()
        self.key_sentences = []

    def __del__(self):
        """Release all native models."""
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        # BUG FIX: the NER model was loaded in __init__ but never released
        self.recognizer.release()
        self.labeller.release()
def word_pos():
    """Tag every candidate sentiment word with both LTP and jieba POS tags
    and write the augmented table back to ../data/candidate_sentiment.csv."""
    # LTP POS tagging
    candidate = pd.read_csv(r'../data/candidate_sentiment.csv', header=None)
    can_word = candidate[0].tolist()
    # add two columns to hold the POS results
    candidate.insert(2, 'ltp_pos', 0)
    candidate.insert(3, 'jieba_pos', 0)
    candidate.columns = ['word', 'freq', 'ltp_pos', 'jieba_pos']
    LTP_DATA_DIR = '../ltp_data_v3.4.0/ltp_data_v3.4.0'  # LTP model directory path
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS model file, named `pos.model`
    postagger = Postagger()  # initialize instance
    postagger.load(pos_model_path)  # load model
    # each candidate word is tagged as if it were one token of a sentence
    postags = postagger.postag(can_word)  # POS tagging
    postagger.release()  # release model
    postags = list(postags)
    candidate['ltp_pos'] = postags
    # jieba POS tagging
    jieba_pos = []
    for index, row in candidate.iterrows():
        s = row['word']
        words = pseg.cut(s)
        # a word may split into several jieba tokens; keep all flags joined
        pos = []
        for w in words:
            pos.append(w.flag)
        pos = ' '.join(pos)
        jieba_pos.append(pos)
    candidate['jieba_pos'] = jieba_pos
    # write back, now with a header row
    candidate.to_csv(r'../data/candidate_sentiment.csv', index=None)
def locationNER(text):
    """Return place/organization entity strings found in *text* using LTP
    segmentation, POS tagging and NER.

    Multi-word entities (tags containing 'I-Ns'/'I-Ni') are stitched from the
    previous, current and next word; single-word entities ('S-Ns'/'S-Ni')
    are appended directly.
    """
    # BUG FIX: `results` was never initialized in this scope — the original
    # raised NameError unless an unseen module-level global existed.
    results = []
    # segmentation
    segmentor = Segmentor()  # initialize instance
    segmentor.load(cws_model_path)  # load model
    words = segmentor.segment(text)  # segment
    # print('\t'.join(words))
    segmentor.release()
    # POS tagging
    postagger = Postagger()  # initialize instance
    postagger.load(pos_model_path)  # load model
    postags = postagger.postag(words)  # POS tagging
    postagger.release()  # release model
    # finally, geographic entity recognition
    recognizer = NamedEntityRecognizer()  # initialize instance
    recognizer.load(ner_model_path)  # load model
    netags = recognizer.recognize(words, postags)  # named entity recognition
    for i in range(0, len(netags)):
        if 'I-Ns' in netags[i] or 'I-Ni' in netags[i]:
            # BUG FIX: guard i-1 / i+1 so first/last tokens cannot raise
            # IndexError (or silently wrap to words[-1])
            if 0 < i < len(netags) - 1:
                results.append(words[i-1] + words[i] + words[i+1])
        if 'S-Ns' in netags[i] or 'S-Ni' in netags[i]:
            results.append(words[i])
    recognizer.release()  # release model (was leaked in the original)
    return results
def cut_words():
    """Segment resource_new.txt (skipping blank lines) into
    key/cut_resouce.txt, breaking output lines at punctuation (POS 'wp').

    POS tag set: http://ltp.readthedocs.io/zh_CN/latest/appendix.html
    """
    src = open('resource_new.txt', 'r', encoding='utf-8')
    out = open('key/cut_resouce.txt', 'w', encoding='utf-8')
    segmentor = Segmentor()  # initialize instance
    # segmentor.load('cws.model') # load model without the user dictionary
    segmentor.load_with_lexicon('module/cws.model', 'userdict.txt')  # model + user dictionary
    postagger = Postagger()  # initialize instance
    postagger.load('module/pos.model')  # load model
    for sentence in src:
        if sentence.strip() == '':
            continue  # skip blank lines
        words = segmentor.segment(sentence)  # segment
        pos_tags = postagger.postag(words)  # POS tag
        for word, tag in zip(words, pos_tags):
            # punctuation ends the current output line; words are written as-is
            out.write(word if tag != 'wp' else '\n')
        out.write('\n')
    out.close()
    segmentor.release()
    postagger.release()
def segmentsentence(sentence):
    """Segment *sentence*, NER-tag it, append person-name words to the
    module-level ``entity_list``, and return the word list."""
    segmentor = Segmentor()
    postagger = Postagger()
    parser = Parser()
    recognizer = NamedEntityRecognizer()
    segmentor.load("./ltpdata/ltp_data_v3.4.0/cws.model")
    postagger.load("./ltpdata/ltp_data_v3.4.0/pos.model")
    # parser.load("./ltpdata/ltp_data_v3.4.0/parser.model")
    recognizer.load("./ltpdata/ltp_data_v3.4.0/ner.model")
    #############
    word_list = segmentor.segment(sentence)
    postags_list = postagger.postag(word_list)
    nertags = recognizer.recognize(word_list, postags_list)
    ############
    for word, ntag in zip(word_list, nertags):
        # NOTE(review): LTP NE tags carry position prefixes (e.g. 'S-Nh',
        # 'B-Nh'); a bare 'Nh' comparison may never match — verify intent.
        if ntag == 'Nh':
            # NOTE(review): entity_list is not defined in this scope —
            # presumably a module-level global; confirm it exists.
            entity_list.append(word)
    print(" ".join(word_list))
    print(' '.join(nertags))
    ############
    segmentor.release()
    postagger.release()
    # parser.release()
    recognizer.release()
    return word_list
def postags_opt(words):
    """POS-tag *words* with LTP and return only the tokens tagged 'v' (verb).

    :param words: iterable of segmented tokens
    :return: list of verb tokens, in input order
    """
    # pyltp POS model location
    LTP_DATA_DIR = '../ltp_data_v3.4.0'
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
    # load, tag, release
    postagger = Postagger()
    postagger.load(pos_model_path)
    postags = postagger.postag(words)
    postagger.release()
    tags = list(postags)
    # keep the words whose tag marks a verb
    return [word for word, tag in zip(words, tags) if tag == 'v']
def extract_views(all_sents):
    """For each group of sentences, find sentences where a subject utters an
    opinion through a candidate say-verb, and collect (subject, view, idx).

    :param all_sents: iterable of sentence groups (lists of raw strings)
    :return: list of {'sents': group, 'views': [(subj, view, group_idx), ...]}
        containing only groups with at least one extracted view
    """
    segmentor = Segmentor()
    segmentor.load(r'/home/student/project-01/ltp_data/cws.model')
    postagger = Postagger()
    postagger.load(r'/home/student/project-01/ltp_data/pos.model')
    parser = Parser()
    parser.load(r'/home/student/project-01/ltp_data/parser.model')
    views_in_sents = []
    for i, sents in enumerate(all_sents):
        views_tmp = []
        for sent in sents:
            # turn literal "\n" back into newlines, then trim
            sent = sent.replace('\\n', '\n').strip()
            if len(sent) == 0:
                continue
            # words = list(jieba.cut(sent))
            words = list(segmentor.segment(sent))
            # external helper: say-verb candidates present in this sentence
            contains = contain_candidates(words)
            if len(contains) == 0:
                continue
            tags = list(postagger.postag(words))
            arcs = list(parser.parse(words, tags))
            # external helper: subject-verb (SBV) pair and head verb
            sbv, head = get_sbv_head(arcs, words, tags)
            # require a subject, a head verb, and the head to be a candidate
            if sbv[0] is None or head[0] is None or head[0] not in contains:
                continue
            subj = sbv[0]
            # external helper: tidy the text following the say-verb
            view = clean_view(words[head[1] + 1:])
            views_tmp.append((subj, view, i))
        if len(views_tmp) > 0:
            views_in_sents.append({'sents': sents, 'views': views_tmp})
    segmentor.release()
    postagger.release()
    parser.release()
    return views_in_sents
def _load_testset(self):
    """
    Load the test set: for every JSON line, jieba-segment the text,
    POS-tag and dependency-parse it with LTP, then align the parse with
    BERT tokenization for inference.
    :return: list of aligned examples
    """
    par_model_path = os.path.join(self.ltp_dir, 'parser.model')
    pos_model_path = os.path.join(self.ltp_dir, 'pos.model')
    postagger = Postagger()
    postagger.load(pos_model_path)
    parser = Parser()
    parser.load(par_model_path)
    examples = []
    with open(os.path.join(self.data_dir, self.file_name)) as f:
        for l in tqdm(f):
            l = json.loads(l)
            # Chinese NER is a character-level (BERT) model, so the raw text
            # is later treated as a character list; output format per entity
            # is (entity, type, begin, end).
            text_seg = jieba.lcut(l['text'], HMM=False)  # deterministic cut
            poses = ' '.join(postagger.postag(text_seg)).split()
            arcs = parser.parse(text_seg, poses)
            # flatten arcs to "head:relation" tokens, one per word
            arcses = ' '.join("%d:%s" % (arc.head, arc.relation)
                              for arc in arcs).split()
            examples.append(
                self.align_bert_4_inference(l, text_seg, arcses))
    return examples
class LtpTree(DepTree):
    """Dependency tree built with pyltp segmentation/POS/parser models."""

    def __init__(self, dict_path=None):
        # NOTE(review): super(DepTree, self) skips DepTree.__init__ and calls
        # DepTree's *parent* initializer instead — this looks like it should
        # be super(LtpTree, self); confirm against DepTree before changing.
        super(DepTree, self).__init__()
        print("正在加载LTP模型... ...")
        self.segmentor = Segmentor()
        if dict_path is None:
            self.segmentor.load(os.path.join(MODELDIR, "cws.model"))
        else:
            # a user lexicon was supplied: load the segmenter with it
            self.segmentor.load_with_lexicon(os.path.join(MODELDIR, "cws.model"), dict_path)
        self.postagger = Postagger()
        self.postagger.load(os.path.join(MODELDIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(MODELDIR, "parser.model"))
        print("加载模型完毕。")

    def parse(self, sentence):
        """Segment, POS-tag and dependency-parse *sentence*, storing
        words/postags/arcs on self; arcs with head 0 are relabelled ROOT."""
        self.words = self.segmentor.segment(sentence)
        self.postags = self.postagger.postag(self.words)
        self.arcs = self.parser.parse(self.words, self.postags)
        for i in range(len(self.words)):
            if self.arcs[i].head == 0:
                self.arcs[i].relation = "ROOT"

    def release_model(self):
        # release native model memory
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
def namedEntityRecognize(sentence):
    '''
    Run pyltp named entity recognition over *sentence*.
    Returns a pair: 1) list of (word, ne_tag) tuples, 2) flat NE tag list.
    '''
    segmentor = Segmentor()
    # segmentor.load(inout.getLTPPath(index.CWS))
    segmentor.load_with_lexicon(inout.getLTPPath(index.CWS),
                                inout.getResourcePath('userDic.txt'))
    words = segmentor.segment(sentence)
    segmentor.release()

    postagger = Postagger()
    postagger.load(inout.getLTPPath(index.POS))
    postags = postagger.postag(words)
    postagger.release()

    recognizer = NamedEntityRecognizer()
    recognizer.load(inout.getLTPPath(index.NER))
    netags = recognizer.recognize(words, postags)
    recognizer.release()

    # pair every word with its NE tag
    namedEntityTagTupleList = list(zip(words, netags))
    neTagList = '\t'.join(netags).split('\t')
    return namedEntityTagTupleList, neTagList
def get_postag_list(self, word_list, model):
    """Load the POS model at *model*, tag *word_list*, release the model
    and return the tags as a list."""
    tagger = Postagger()
    tagger.load(model)
    try:
        return list(tagger.postag(word_list))
    finally:
        tagger.release()  # always free the native model
def init_pyltp(model_dir, dict_file=None):
    '''
    Initialize the pyltp modules.
    :param model_dir: directory containing the LTP model files
    :param dict_file: optional external lexicon for the segmenter
    :return: (segmentor, postagger, parser, ner)
    '''
    cws_model = os.path.join(model_dir, 'cws.model')
    pos_model = os.path.join(model_dir, 'pos.model')
    parser_model = os.path.join(model_dir, 'parser.model')
    ner_model = os.path.join(model_dir, 'ner.model')

    segmentor = Segmentor()
    if dict_file:
        # load with the user-supplied lexicon
        segmentor.load_with_lexicon(cws_model, dict_file)
    else:
        segmentor.load(cws_model)

    postagger = Postagger()
    postagger.load(pos_model)
    ner = NamedEntityRecognizer()
    ner.load(ner_model)
    parser = Parser()
    parser.load(parser_model)
    return segmentor, postagger, parser, ner
class LtpLanguageAnalysis(object):
    """Python 2 wrapper bundling LTP segmentation, POS tagging and
    dependency parsing; analyze() prints each stage's result."""

    def __init__(self, model_dir="/home/xxx/ltp-3.4.0/ltp_data/"):
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(model_dir, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(model_dir, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(model_dir, "parser.model"))

    def analyze(self, text):
        # segmentation
        words = self.segmentor.segment(text)
        print '\t'.join(words)
        # POS tagging
        postags = self.postagger.postag(words)
        print '\t'.join(postags)
        # dependency parsing
        arcs = self.parser.parse(words, postags)
        print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

    def release_model(self):
        # release native model memory
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
def ltp_word(self): """创建一个方法,用来进行句子的分词、词性分析等处理。""" # 分词 segmentor = Segmentor() segmentor.load(os.path.join(MODELDIR, "cws.model")) words = segmentor.segment(self.content) #print("*************分词*****************") #print("\t".join(words)) # 词性标注 postagger = Postagger() postagger.load(os.path.join(MODELDIR, "pos.model")) postags = postagger.postag(words) #print("*************词性标注*************") #print(type(postags)) #print("\t".join(postags)) # 依存句法分析 parser = Parser() parser.load(os.path.join(MODELDIR, "parser.model")) arcs = parser.parse(words, postags) #print("*************依存句法分析*************") #print(type(arcs)) #print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)) # 把依存句法分析结果的head和relation分离出来 arcs_head = [] arcs_relation = [] for arc in arcs: arcs_head.append(arc.head) arcs_relation.append(arc.relation) # 命名实体识别 recognizer = NamedEntityRecognizer() recognizer.load(os.path.join(MODELDIR, "ner.model")) netags = recognizer.recognize(words, postags) #print("*************命名实体识别*************") #print("\t".join(netags)) """ # 语义角色标注 labeller = SementicRoleLabeller() labeller.load(os.path.join(MODELDIR, "pisrl.model")) roles = labeller.label(words, postags, arcs) print("*************语义角色标注*************") for role in roles: print(role.index, "".join( ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])) """ segmentor.release() postagger.release() parser.release() recognizer.release() #labeller.release() # 调用list_conversion函数,把处理结果列表化 words_result = list_conversion(words, postags, netags, arcs_head, arcs_relation) return words_result
class LtpParser:
    """Wrapper around the full LTP pipeline (seg / POS / parse / NER / SRL)."""

    def __init__(self):
        LTP_DIR = "./ltp_data_v3.4.0"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))

    '''语义角色标注'''
    def format_labelrole(self, words, postags):
        # semantic role labelling: parse first, then label; the result is
        # keyed by predicate index, each argument mapped to [name, start, end]
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {arg.name:[arg.name,arg.range.start, arg.range.end] for arg in role.arguments}
        return roles_dict

    '''句法分析---为句子中的每个词语维护一个保存句法依存儿子节点的字典'''
    def build_parse_child_dict(self, words, postags, arcs):
        # for every word, build a dict: relation -> [child word indices]
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index+1:  # arc heads are 1-based
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]  # dependency head id per word
        relation = [arc.relation for arc in arcs]  # dependency relation per word
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]  # head word text
        for i in range(len(words)):
            # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            # NOTE(review): for ROOT arcs rely_id[i] is 0, so rely_id[i]-1 is
            # -1 and postags[-1] indexes the LAST word's tag — verify intent.
            a = [relation[i], words[i], i, postags[i], heads[i], rely_id[i]-1, postags[rely_id[i]-1]]
            format_parse_list.append(a)
        return child_dict_list, format_parse_list

    '''parser主函数'''
    def parser_main(self, sentence):
        # full-pipeline entry point for a single sentence
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list
def main():
    """Python 2 script: segment/POS-tag sentences from argv[1]
    ("sid\\tsentence" per line), emit CoNLL-style "char pos O" rows to
    argv[2], and write the sentence ids to SID.txt."""
    segmentor = Segmentor()
    segmentor.load('./cws.model')
    postagger = Postagger()
    postagger.load('./pos.model')
    file_object = open(sys.argv[1], 'r')
    sid = []
    output_list = []
    try:
        all_lines = file_object.readlines()
        lc = 0
        tot = 0
        for line in all_lines:
            output = []
            lc += 1
            item = line.split('\t')
            sid.append(item[0])
            sentence = item[1][0:-1]  # drop trailing newline
            #print sentence.decode('utf-8')
            # count and normalize sentences containing spaces
            if (len(sentence.replace(' ', '')) != len(sentence)):
                tot += 1
                print lc
                sentence = sentence.replace(' ', '')
            word = segmentor.segment(sentence.encode('utf-8'))
            pos = postagger.postag(word)
            tag = []
            word = list(word)
            pos = list(pos)
            # decode LTP's byte output back to unicode (Python 2)
            for i in range(len(word)):
                word[i] = word[i].decode('utf-8')
                pos[i] = pos[i].decode('utf-8')
            # external helper: expand words into per-character rows
            word, pos = wordToChar(word, pos)
            for i in range(len(word)):
                tag.append('O')  # every char starts with the 'O' tag
            for i in range(len(word)):
                output.append(word[i] + ' ' + pos[i] + ' ' + tag[i] + '\n')
            output.append('\n')  # blank line separates sentences
            output_list.append(output)
        print tot
    finally:
        file_object.close()
    file_object = open(sys.argv[2], 'w')
    negative_num = 0
    # write rows; count sentences with no non-'O' tag as negatives
    for i in range(len(output_list)):
        ff = 0
        for j in range(len(output_list[i])):
            output_list[i][j].encode('utf-8')
            if (output_list[i][j] != '\n' and output_list[i][j].split(' ')[2][0] != 'O'):
                ff = 1
            file_object.write(output_list[i][j])
        if (ff == 0):
            negative_num += 1
    print negative_num
    file_object = open('SID.txt', 'w')
    for i in range(len(sid)):
        file_object.write(sid[i] + '\n')
    file_object.close()
def segmentation(filename, output_filename):
    """Python 2: segment + POS-tag *filename* line by line and write
    "word/tag" pairs to *output_filename* (NER/parse code left disabled)."""
    print "segmenting '%s' to '%s'" % (filename, output_filename)
    f = open(filename, "r")
    lines = f.readlines()
    f.close()
    MODELDIR = "./ltp_data/"
    # segment
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    # postag
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    # Named Entity Recognize
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    # Parse and get SVO
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    f = open(output_filename, "w")
    # side file for NE output (writes are currently commented out)
    fner = open(output_filename.split(".")[0]+"_ner.txt", "w")
    for _line in lines:
        # strip up to two trailing newline/carriage-return characters
        line = _line[:-1]
        if line[-1] in "\n\r":
            line = line[:-1]
        words = segmentor.segment(line)
        postags = postagger.postag(words)
        # netags = recognizer.recognize(words, postags)
        # arcs = parser.parse(words, postags)
        for i in range(len(words)):
            f.write("%s/%s\t" % (words[i], postags[i]))
            # if netags[i]!='O':
            #     fner.write("%s/%s\t" % (words[i], netags[i]))
        f.write("\n")
        # fner.write("\n")
    f.close()
def words_cixing(words=["中国","进出口","银行","与","中国银行","加强","合作"],type_list=0,pos=0):
    """POS-tag *words* with LTP (863 tag set; see http://www.ltp-cloud.com/intro/).

    :param words: token list, or a raw string (then split via split_words())
    :param type_list: truthy -> return tags as a plain list,
        e.g. ['ns', 'v', 'n', 'c', 'ni', 'v', 'v']
    :param pos: truthy -> return 'word/tag' strings,
        e.g. ['中国/ns', '进出口/v', ...]
    :return: by default the raw pyltp tag sequence (generator-like object)
    """
    if type(words) == str:
        words = split_words(words)
    tagger = Postagger()
    tagger.load(os.path.join(MODELDIR, "pos.model"))
    postags = tagger.postag(words)
    # list-of-string parameter is support in 0.1.5
    # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
    if type_list:
        return [tag for tag in postags]
    if pos:
        return ['{}/{}'.format(word, tag) for word, tag in zip(words, [tag for tag in postags])]
    return postags
def __init__(self): self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model') # 分词模型路径,模型名称为`cws.model` self.pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model') # 词性标注模型路径,模型名称为`pos.model` self.ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model') # 命名实体识别模型路径,模型名称为`pos.model` segmentor = Segmentor() segmentor.load(self.cws_model_path) self.words = segmentor.segment(data) # print("|".join(words)) segmentor.release() postagger = Postagger() # 初始化实例 postagger.load(self.pos_model_path) # 加载模型 self.postags = postagger.postag(self.words) # 词性标注 # print('\t'.join(postags)) postagger.release() # 释放模型 recognizer = NamedEntityRecognizer() # 初始化实例 recognizer.load(self.ner_model_path) # 加载模型 self.netags = recognizer.recognize(self.words, self.postags) # 命名实体识别 # print('\t'.join(netags)) recognizer.release() # 释放模型
def main():
    """Python 2 script: scan Baidu-Zhidao passages in psgs.txt, copy the
    question/doc markup to psgs_segged.txt, and extract the first answer
    snippet per question into zhidao_answer.txt."""
    f = open("psgs.txt", "r")
    lines = [line.rstrip() for line in f.readlines()]
    f.close()
    # LTP models are loaded but currently unused (segmentation code below
    # is commented out)
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    # per-question type labels (e.g. Q_number), one per line
    f = open("../questions/q_facts_segged_clf.txt", "r")
    types = f.readlines()
    f.close()
    # the raw question texts, one per line
    f = open("../questions/provided/q_facts.txt", "r")
    questions = [line.rstrip() for line in f.readlines()]
    f.close()
    f = open("psgs_segged.txt", "w")
    fans = open("zhidao_answer.txt", "w")
    i = 0
    qid = 0   # current question id, parsed from <question ...> tags
    flag = 0  # set once an answer was written for the current question
    while i < len(lines):
        line = lines[i]
        if (i % 50000 == 0):
            # progress indicator (Python 2 trailing-comma print)
            print "\r#\t%d" % i,
            sys.stdout.flush()
        if line.startswith("<question"):
            qid = int(line.split(" ")[1].split("=")[1].split(">")[0])
            flag = 0
            f.write(line + "\n")
        elif line.startswith("</doc") or line.startswith("</question"):
            f.write(line + "\n")
        elif line.startswith("<doc"):
            # copy the doc tag and its first content line, then skip it
            f.write(line + "\n" + lines[i+1] + "\n")
            i += 2
        else:
            L = len(line)
            s = 0
            # find where an answer marker starts
            for s in range(L):
                if line[s:].startswith("最佳答案:") \
                        or line[s:].startswith("[专业]答案")\
                        or line[s:].startswith("、"+questions[qid-1]):
                    break
            # skip past the marker itself (byte offsets in Python 2)
            if line[s:].startswith("最佳答案"):
                s += 14
            elif line[s:].startswith("[专业]答案"):
                s += 15
            elif line[s:].startswith("、"+questions[qid-1]):
                s += len(questions[qid-1])+1
            if s < L and flag == 0:
                # scan forward to the end of the answer snippet
                t = s + 1
                while t < L and line[t:].startswith("更多") == False\
                        and not (t+2<L and line[t]==" " and line[t+1] in "0123456789" and line[t+2] in "0123456789")\
                        and not line[t:].startswith("~")\
                        and not line[t:].startswith("?")\
                        and not line[t:].startswith("!")\
                        and not line[t:].startswith("。"):
                    t += 1
                # keep only reasonably sized snippets
                if s < t and t-s < 200 and t-s > 1:
                    ans = line[s:t].rstrip(".。 ??,,")
                    if types[qid-1].rstrip() == "Q_number":
                        ans = first_con_number(ans)  # external: first number in the text
                    fans.write("%d\t%s\n" % (qid, ans))
                    flag = 1
        # words = segmentor.segment(line)
        # postags = postagger.postag(words)
        # for j in range(len(words)):
        #     f.write("%s/%s\t" % (words[j], postags[j]))
        # f.write("\n")
        i += 1
    f.close()
    fans.close()
# Set your own model path MODELDIR=os.path.join(ROOTDIR, "ltp_data") from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller paragraph = '中国进出口银行与中国银行加强合作。中国进出口银行与中国银行加强合作!' sentence = SentenceSplitter.split(paragraph)[0] segmentor = Segmentor() segmentor.load(os.path.join(MODELDIR, "cws.model")) words = segmentor.segment(sentence) print "\t".join(words) postagger = Postagger() postagger.load(os.path.join(MODELDIR, "pos.model")) postags = postagger.postag(words) # list-of-string parameter is support in 0.1.5 # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"]) print "\t".join(postags) parser = Parser() parser.load(os.path.join(MODELDIR, "parser.model")) arcs = parser.parse(words, postags) print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs) recognizer = NamedEntityRecognizer() recognizer.load(os.path.join(MODELDIR, "ner.model")) netags = recognizer.recognize(words, postags) print "\t".join(netags)