def __init__(self, articles_filename='articles.csv', record_filename='record.csv',
             rule_reference_filename='rule_reference.txt', LTP_DIR="ltp_data_v3.4.0/",
             filter_dictionary=['有限公司']):
    self.articles_filename = articles_filename
    self.record_filename = record_filename
    self.rule_reference_filename = rule_reference_filename
    # -------- load the LTP models --------
    self.LTP_DIR = LTP_DIR
    # word segmentation model
    self.segmentor = pyltp.Segmentor()
    self.segmentor.load(os.path.join(self.LTP_DIR, "cws.model"))
    # part-of-speech tagging model
    self.postagger = pyltp.Postagger()
    self.postagger.load(os.path.join(self.LTP_DIR, 'pos.model'))
    # named entity recognition model
    self.recognizer = pyltp.NamedEntityRecognizer()
    self.recognizer.load(os.path.join(self.LTP_DIR, 'ner.model'))
    # dependency parsing model
    self.parser = pyltp.Parser()
    self.parser.load(os.path.join(self.LTP_DIR, 'parser.model'))
    self.filter_dictionary = filter_dictionary
    self.left_postags_dict = {}
    self.left_word_dict = {}
    self.mid_postags_dict = {}
    self.mid_word_dict = {}
    self.right_postags_dict = {}
    self.right_word_dict = {}
    self.CMP_dict = {}
    self.SBV_dict = {}
    self.VOB_dict = {}
def build_files(self):
    """Walk the raw documents, run segmentation and POS tagging, remove stop
    words, and build the collection of FileItem objects."""
    files = []
    category_id = 0
    segmentor = pyltp.Segmentor()
    segmentor.load(r'C:\Users\51694\PycharmProjects\paper\ltp_model\cws.hyp')
    postagger = pyltp.Postagger()
    postagger.load(r'C:\Users\51694\PycharmProjects\paper\ltp_model\pos.hyp')
    parser = pyltp.Parser()
    parser.load(r'C:\Users\51694\PycharmProjects\paper\ltp_model\parser.hyp')
    for ids, path in enumerate(self.file_paths()):
        with open(path, 'r', encoding='utf-8') as f:
            try:
                category = self.path2category(path)
                if category not in self.category_ids:
                    self.category_ids[category] = category_id
                    category_id += 1
                raw = self.process_line(f.read())
                words = self.remove_stop_words(list(segmentor.segment(raw)))
                words = self.clean_specific(words)
                pos = list(postagger.postag(words))
                parse_result = list(parser.parse(words, pos))
                files.append(FileItem(ids, category, words, pos, parse_result))
            except UnicodeDecodeError:
                logging.warning(path + ' UTF-8 decoding failed, please check the file encoding')
                continue
    segmentor.release()
    postagger.release()
    parser.release()
    return files
def __init__(self, model_dir_path, blacklist_path):
    '''
    model_dir_path: path to the pyltp model files
    blacklist_path: path to the blacklist file
    '''
    # initialize the model file paths
    self.model_dir_path = model_dir_path
    self.cws_model_path = os.path.join(self.model_dir_path, 'cws.model')  # segmentation model, file name `cws.model`
    self.pos_model_path = os.path.join(self.model_dir_path, 'pos.model')  # POS tagging model, file name `pos.model`
    self.ner_model_path = os.path.join(self.model_dir_path, 'ner.model')  # named entity recognition model, file name `ner.model`
    # initialize the segmentation model
    self.segmentor = pyltp.Segmentor()
    self.segmentor.load(self.cws_model_path)
    # initialize the POS tagging model
    self.postagger = pyltp.Postagger()
    self.postagger.load(self.pos_model_path)
    # initialize the NER model
    self.recognizer = pyltp.NamedEntityRecognizer()
    self.recognizer.load(self.ner_model_path)
    # initialize the company-name blacklist
    self.com_blacklist = set()
    with open(blacklist_path, 'r', encoding='utf-8') as f_com_blacklist:
        for line in f_com_blacklist:
            if len(line.strip()) > 0:
                self.com_blacklist.add(line.strip())
def __init__(self):
    jieba.initialize()
    self.ltpseg = pyltp.Segmentor()
    self.ltpseg.load('model/ltp_data_v3.4.0/cws.model')
    jiagu.init()
    self.thu1 = thulac.thulac(seg_only=True)
    pynlpir.open()
def word_segmentation(sentence):
    cws_ = pyltp.Segmentor()
    cws_.load(cws_model_path)
    # Copy the result into a plain list before releasing the model,
    # so the returned value stays valid after release().
    words = list(cws_.segment(sentence))
    print('\t'.join(words))
    cws_.release()
    return words
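# A minimal usage sketch for word_segmentation() above; the model path and the
# sample sentence are placeholders (a module-level cws_model_path is assumed).
cws_model_path = 'ltp_data_v3.4.0/cws.model'  # hypothetical location
tokens = word_segmentation('我爱自然语言处理')
print(tokens)  # the actual split depends on the model, e.g. ['我', '爱', '自然', '语言', '处理']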
def cut_words(sentences):
    segmentor = pyltp.Segmentor()
    segmentor.load(ltp_path + 'cws.model')
    words = [
        word for sentence in sentences for word in segmentor.segment(sentence)
    ]
    segmentor.release()
    return words
def __init__(self):
    self.path = '../data/'
    self.unprocessed_data = []
    self.attribute = ['dis', 'bod', 'sym', 'dec', 'fre', 'ite']
    self.dics = {}
    self.processed_data = []
    self.segmentor = pp.Segmentor()
    self.segmentor.load("ltp_data_v3.4.0/cws.model")  # load the segmentation model
def __init__(self):
    model_path = '/home/lnn/Documents/postag/ltp_data_v3.4.0/'
    self.seg = pyltp.Segmentor()
    self.seg.load(model_path + 'cws.model')
    self.pos = pyltp.Postagger()
    self.pos.load(model_path + 'pos.model')
    self.parser = pyltp.Parser()
    self.parser.load(model_path + 'parser.model')
def segment(sentence):
    global segmentor_
    if segmentor_ is None:
        segmentor_ = pyltp.Segmentor()
        # segmentor_.load(ltp_models['cws'])
        # Load the model together with a user lexicon; the second argument
        # is the path to the external dictionary file.
        segmentor_.load_with_lexicon(ltp_models['cws'], personal_seg_dict)
    return segmentor_.segment(sentence)
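# The segment() helper above relies on module-level globals; a minimal sketch of
# how they might be wired up. The paths are placeholders, and the lexicon is the
# plain-text, one-word-per-line, UTF-8 format that pyltp's load_with_lexicon expects.
import pyltp

ltp_models = {'cws': 'ltp_data_v3.4.0/cws.model'}   # hypothetical model path
personal_seg_dict = 'personal_dict.txt'             # hypothetical user dictionary
segmentor_ = None                                   # lazily initialized by segment()

print(list(segment('百度是一家高科技公司')))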
def ltpSetup():
    LTP_DATA_DIR = './ltp_data_v3.4.0/'
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
    segmentor = pyltp.Segmentor()
    segmentor.load(cws_model_path)
    postagger = pyltp.Postagger()
    postagger.load(pos_model_path)
    return segmentor, postagger
def load_model():
    '''
    Load the segmentation and POS tagging models.
    '''
    segmentor = pyltp.Segmentor()
    segmentor.load("./ltp_data/cws.model")
    postagger = pyltp.Postagger()
    postagger.load("./ltp_data/pos.model")
    return segmentor, postagger
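# One possible way to use the (segmentor, postagger) pair returned by ltpSetup()
# or load_model() above; the sample sentence is only a placeholder.
segmentor, postagger = load_model()
words = list(segmentor.segment('我爱自然语言处理'))
postags = list(postagger.postag(words))
print(list(zip(words, postags)))
segmentor.release()
postagger.release()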
def cut_words(words):
    segmentor = pyltp.Segmentor()
    seg_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    segmentor.load(seg_model_path)
    words = segmentor.segment(words)
    array_str = "|".join(words)
    array = array_str.split("|")
    segmentor.release()
    return array
def Segmentor(self, sent):
    if self.__segmentor is None:
        self.__segmentor = pyltp.Segmentor()
        if self.__seg_lexicon_path is None:
            self.__segmentor.load(self.__seg_model_path)
        else:
            self.__segmentor.load_with_lexicon(self.__seg_model_path,
                                               self.__seg_lexicon_path)
    words = self.__segmentor.segment(sent)
    return words
def cut_words(words):
    segmentor = pyltp.Segmentor()
    seg_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    # Use a raw string for the Windows path to avoid invalid escape sequences.
    segmentor.load_with_lexicon(seg_model_path,
                                r'D:\KG\KGQA_HLM-master\KGQA\dict.txt')
    words = segmentor.segment(words)
    array_str = "|".join(words)
    array = array_str.split("|")
    segmentor.release()
    return array
def __init__(self, config_lib="ltp", config_dict=None, config_stop=None,
             config_dir=None, seg_out_list=False):
    self.input_type = str
    self.config_dir = config_dir
    if config_dir is None:
        self.config_dir = ('E:/Data/' if 'windows' in platform.architecture()[1].lower()
                           else '/users/fanzfeng/Data/')
    self.stop_config = False
    if config_stop is not None and isinstance(config_stop, str) and os.path.exists(config_stop):
        self.stop_config = True
        with open(config_stop, "r", encoding="utf-8") as fp:
            self.stop_words = [k.strip() for k in fp.readlines() if len(k.strip()) > 0]
    elif isinstance(config_stop, (list, tuple, set)) and len(config_stop) > 0:
        self.stop_config = True
        self.stop_words = config_stop
    self.all_cut = False
    self.seg_out_list = seg_out_list
    self.config_lib = config_lib
    if config_lib == "jieba":
        self.jieba_ner = "nr ns nt m".split()
        if config_dict is not None and isinstance(config_dict, str) and os.path.exists(config_dict):
            jieba.load_userdict(config_dict)
        self.seg = jieba.cut
        self.pos_seg = pseg.cut
    elif config_lib == "ltp":
        import pyltp
        self.segmentor = pyltp.Segmentor()
        if config_dict is not None and isinstance(config_dict, str) and os.path.exists(config_dict):
            self.segmentor.load_with_lexicon(
                os.path.join(self.config_dir, "ltp_data_v3.4.0/cws.model"), config_dict)
        else:
            self.segmentor.load(os.path.join(self.config_dir, "ltp_data_v3.4.0/cws.model"))
        self.seg = self.segmentor.segment
        self.postagger = pyltp.Postagger()
        self.text_splitter = pyltp.SentenceSplitter.split
        self.postagger.load(os.path.join(self.config_dir, "ltp_data_v3.4.0/pos.model"))
        self.recognizer = pyltp.NamedEntityRecognizer()
        self.recognizer.load(self.config_dir + "ltp_data_v3.4.0/ner.model")
def cut_words(words):
    user_dict = 'ds_dict.txt'
    segmentor = pyltp.Segmentor()  # initialize the instance
    seg_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    # Load the segmentation model together with the user dictionary;
    # passing the dictionary file to load() in place of the model would fail.
    segmentor.load_with_lexicon(seg_model_path, os.path.join(LTP_DATA_DIR, user_dict))
    words = segmentor.segment(words)
    print(words)
    array_str = "|".join(words)
    print(array_str)
    array = array_str.split("|")
    segmentor.release()
    return array
def do_seg(self, intxt):
    if self.segmentor is None:
        self.segmentor = pyltp.Segmentor()
        if self.debug:
            load_start = default_timer()
        # A user dictionary can be supplied here: the external dictionary is a
        # plain-text file with one word per line, and it must also be UTF-8 encoded.
        # self.segmentor.load_with_lexicon('model path', 'user dictionary')
        self.segmentor.load(os.path.join(self.model_dir, 'cws.model'))  # load the model
        if self.debug:
            load_use = default_timer() - load_start
            self.loger.debug("load cws.model use [ %f ] s" % load_use)
    words = self.segmentor.segment(intxt)
    return list(words)
def sentence(
    articles: List[Dict],
    project: Path = os.getcwd(),
    ltp_dir=os.path.abspath(os.path.join(os.path.realpath(__file__), "../..")) + '/ltp_data'
) -> List[Dict]:
    logger = hlogger(project)
    start_time = datetime.datetime.now()
    logger.info('Starting to process sentences')
    # load the LTP models
    # word segmentation model
    segmentor = pyltp.Segmentor()
    segmentor.load(os.path.join(ltp_dir, "cws.model"))
    # POS tagging model
    postagger = pyltp.Postagger()
    postagger.load(os.path.join(ltp_dir, 'pos.model'))
    # named entity recognition model
    recognizer = pyltp.NamedEntityRecognizer()
    recognizer.load(os.path.join(ltp_dir, 'ner.model'))
    if_force = False
    # if a lexicon exists, force entity tagging to load it
    if os.path.exists(project + '/lexicon'):
        logger.info('Ner will use lexicon')
        if_force = force_segmentor()
        if_force.load(project + '/lexicon')
    logger.info('Processing sentences')
    results = []
    for article in articles:
        result = extract_information(article['id'], article['content'],
                                     segmentor, postagger, recognizer, if_force)
        results.extend(result)
    length = len(results)
    end_time = datetime.datetime.now()
    logger.info(
        'Sentences have been processed successfully, and there are %s sentences' % length)
    logger.info('FINISHED! using time : %s\n' % get_time((end_time - start_time).seconds))
    return results
def __init__(self, *args, **kwargs):
    self.__LTP_DATA_DIR = 'D:\\NLP\\ltp_data'
    self.__cws_model_path = os.path.join(self.__LTP_DATA_DIR, 'cws.model')
    self.__pos_model_path = os.path.join(self.__LTP_DATA_DIR, 'pos.model')
    self.__par_model_path = os.path.join(self.__LTP_DATA_DIR, 'parser.model')
    self.segmentor = pyltp.Segmentor()
    self.segmentor.load_with_lexicon(self.__cws_model_path, './../data/word_dict.txt')
    self.postagger = pyltp.Postagger()
    self.postagger.load(self.__pos_model_path)
    self.parser = pyltp.Parser()
    self.parser.load(self.__par_model_path)
    self.tags_dict = {}
def __init__(self):
    self.path = 'ltp_data_v3.4.0/'  # download from https://ltp.ai/download.html (version 3.4.0)
    self.segmentor = pp.Segmentor()
    self.segmentor.load(self.path + "cws.model")  # load the segmentation model
    self.postagger = pp.Postagger()
    self.postagger.load(self.path + "pos.model")  # load the POS tagging model
    self.recognizer = pp.NamedEntityRecognizer()
    self.recognizer.load(self.path + "ner.model")  # load the named entity recognition model
    self.parser = pp.Parser()
    self.parser.load(self.path + "parser.model")  # load the dependency parsing model
    self.labeller = pp.SementicRoleLabeller()
    self.labeller.load(self.path + "pisrl.model")  # load the semantic role labeling model
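# A rough sketch of running the full pipeline initialized above on one sentence.
# `Pipeline` stands in for whatever class owns this __init__ (a hypothetical name),
# and the sentence is only an example.
nlp = Pipeline()
words = list(nlp.segmentor.segment('他说今天天气很好。'))
postags = list(nlp.postagger.postag(words))
netags = list(nlp.recognizer.recognize(words, postags))
arcs = nlp.parser.parse(words, postags)
roles = nlp.labeller.label(words, postags, arcs)
print(list(zip(words, postags, netags)))
print('\t'.join('%d:%s' % (arc.head, arc.relation) for arc in arcs))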
def __init__(self, ltp_path, dependency=False):
    self.dependency = dependency
    cws_model_path = os.path.join(ltp_path, 'cws.model')
    pos_model_path = os.path.join(ltp_path, 'pos.model')
    ner_model_path = os.path.join(ltp_path, 'ner.model')
    dp_model_path = os.path.join(ltp_path, 'parser.model')
    self.seg = pyltp.Segmentor()
    self.pos = pyltp.Postagger()
    self.ner = pyltp.NamedEntityRecognizer()
    # self.srl = pyltp.SementicRoleLabeller()
    self.seg.load(cws_model_path)
    self.pos.load(pos_model_path)
    self.ner.load(ner_model_path)
    # self.srl.load(srl_model_path)
    if dependency:
        self.dp = pyltp.Parser()
        self.dp.load(dp_model_path)
def __init__(self,
             seg_model_path='ltp_data_v3/ltp_data_v3.4.0/cws.model',
             seg_lexicon_path='lexicon/lexicon_test',
             pos_model_path='ltp_data_v3/ltp_data_v3.4.0/pos.model',
             rec_model_path='ltp_data_v3/ltp_data_v3.4.0/ner.model',
             par_model_path='ltp_data_v3/ltp_data_v3.4.0/parser.model'):
    self.seg_lexicon_path = seg_lexicon_path
    self.segmentor = pyltp.Segmentor()
    self.seg_model_path = seg_model_path
    self.segmentor.load_with_lexicon(self.seg_model_path, self.seg_lexicon_path)
    self.postagger = pyltp.Postagger()
    self.pos_model_path = pos_model_path
    self.postagger.load(self.pos_model_path)
    self.recognizer = pyltp.NamedEntityRecognizer()
    self.rec_model_path = rec_model_path
    self.recognizer.load(rec_model_path)
    self.parser = pyltp.Parser()
    self.par_model_path = par_model_path
    self.parser.load(self.par_model_path)
def replace(self, input_path, output_path, replace_key, pattern):
    with open(input_path, "r") as input_file:
        with open(output_path, "w") as output_file:
            # print(input_file.read())
            for line in input_file.readlines():
                # strip the trailing newline
                line = line.strip()
                # segment the line with LTP
                segmentor = pyltp.Segmentor()
                segmentor.load_with_lexicon("../utils/ltp/models/cws.model", "lexicon")
                words = list(segmentor.segment(line))
                segmentor.release()
                # join the segmented words
                line = " ".join(words)
                # replace the keyword
                line = line.replace(replace_key, "*")
                # generate the AIML entry
                line = pattern.replace("模板句子", line)
                output_file.write(line)
def __init__(self):
    # model_file = './data/news_12g_baidubaike_20g_novel_90g_embedding_64.bin'
    self.model_file = './data/sgns.baidubaike.bigram-char'
    self.model = gensim.models.KeyedVectors.load_word2vec_format(self.model_file, binary=False)
    # model = gensim.models.Word2Vec.load(model_file)
    self.LTP_DATA_DIR = './data/ltp_data_v3.4.0/'  # directory holding the LTP models
    self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')  # segmentation model, file name `cws.model`
    self.segmentor = ltp.Segmentor()
    self.segmentor.load(self.cws_model_path)
    self.stopwords = [
        line.strip()
        for line in open('./data/哈工大停用词表.txt', encoding='UTF-8').readlines()
    ]
    with open('./data/wordlist.pkl', 'rb') as f:
        self.word_list = pickle.load(f)
    self.q_list, self.a_list = self.read_corpus()
    print('sim model init finished ......')
    return
def ltp_init(self):
    import pyltp
    LTP_DATA_DIR = '/nas/data/m1/panx2/lib/ltp/ltp_data_v3.4.0'
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
    self.model_ltp_splitter = pyltp.SentenceSplitter()
    self.model_ltp_segmentor = pyltp.Segmentor()
    self.model_ltp_segmentor.load(cws_model_path)
    self.model_ltp_postagger = pyltp.Postagger()
    self.model_ltp_postagger.load(pos_model_path)
    self.model_ltp_recognizer = pyltp.NamedEntityRecognizer()
    self.model_ltp_recognizer.load(ner_model_path)
    self.model_ltp_dparser = pyltp.Parser()
    self.model_ltp_dparser.load(par_model_path)
    self.parse = self._parse
    self.sent_seger = self.ltp_sent_seger
    self.tokenizer = self.ltp_tokenizer
    self.processor = self.ltp_processor
def _model_initialize(self):
    if self.__segmentor is None:
        self.__segmentor = pyltp.Segmentor()
        if self.__seg_lexicon_path is None:
            self.__segmentor.load(self.__seg_model_path)
        else:
            self.__segmentor.load_with_lexicon(self.__seg_model_path,
                                               self.__seg_lexicon_path)
    if self.__postagger is None:
        self.__postagger = pyltp.Postagger()
        if self.__seg_lexicon_path is None:
            self.__postagger.load(self.__pos_model_path)
        else:
            self.__postagger.load_with_lexicon(self.__pos_model_path,
                                               self.__seg_lexicon_path)
    if self.__recognizer is None:
        self.__recognizer = pyltp.NamedEntityRecognizer()
        self.__recognizer.load(self.__rec_model_path)
    if self.__parser is None:
        self.__parser = pyltp.Parser()
        self.__parser.load(self.__par_model_path)
import json
import http.client, urllib.parse
import pyltp
from mysite.language_detect import load_ngram_dict, sentence_to_feature

language_list = ['Zh', 'En', 'Ug']
max_n = 4
language_ngram_dict = {}
for language in language_list:
    language_ngram_dict[language] = load_ngram_dict(
        "/home/xwshi/PolarlionSite/PolarlionMT/mysite/static/gram/%s" % language)
    # filename = "/home/xwshi/PolarlionSite/PolarlionMT/mysite/static/gram/%s" % language
    # language_ngram_dict[language] = load_ngram_dict(filename)

ltp_segmentor = pyltp.Segmentor()
ltp_segmentor.load("/home/xwshi/tools/ltp_data_v3.4.0/cws.model")

# import requests
# import urllib2
# import commands

model_id = {
    "transformer": {
        "en-zh": 100,
        "zh-en": 101,
        "ug-zh": 104
    },
    # 't2t-bigcorpus': {'zh-en': 102},
    'lstm': {
        'zh-en': 103
    }
}
def __init__(self, segmentor=default_segmentor):
    self.segmentor = segmentor
    if self.segmentor == "pyltp":
        self.ltp_segmentor = pyltp.Segmentor()
        self.ltp_segmentor.load(cws_model_path)
def ltp_process(sentence):
    stop_words = get_stops_words()  # load stop words so SBV heads that are stop words (e.g. "是") can be filtered out
    # word segmentation; convert to list so the result survives release()
    segmentor = pyltp.Segmentor()
    segmentor.load("./cws.model")
    words = list(segmentor.segment(sentence))
    print("\t".join(words))
    segmentor.release()
    # POS tagging
    postagger = pyltp.Postagger()
    postagger.load("./pos.model")
    postags = list(postagger.postag(words))
    # list-of-string parameter is supported in 0.1.5
    # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
    print("\t".join(postags))
    postagger.release()
    # dependency parsing
    parser = pyltp.Parser()
    parser.load("./parser.model")
    arcs = parser.parse(words, postags)
    parser.release()
    # semantic role labeling, not actually used yet; loaded here on purpose:
    # after confirming that the SBV head relates to "说", that word must be
    # extracted, and a word that is both SBV and A0 is necessarily the subject.
    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./pisrl.model")
    roles = labeller.label(words, postags, arcs)
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    SBV_set = list()
    Subject_label_set = list()
    Word_of_speech_content = list()
    Index_of_Subjet = 0
    for arc in arcs:
        # SBV_index = get_repeat(arc.head, "SBV")
        k = Index_of_Subjet
        # Checking only for SBV is arguably not rigorous enough, although once an
        # SBV arc is found, its dependent is necessarily A0.
        if arc.relation == "SBV" and words[arc.head - 1] not in stop_words:
            SBV_set.append(words[arc.head - 1])  # arc.head is 1-based; store the predicate verb the SBV points to
            Subject_label_set.append(words[Index_of_Subjet])  # if there is an SBV arc, the word at this position is the subject
            Word_of_speech_content.append(words[arc.head:])  # roughly the part of the sentence after the predicate
            Index_of_Subjet += 1
        else:
            Index_of_Subjet += 1
            continue
    # If these lists are empty, the sentence is not worth analysing further.
    '''
    recognizer = pyltp.NamedEntityRecognizer()
    recognizer.load("./ner.model")
    netags = recognizer.recognize(words, postags)
    print("\t".join(netags))
    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./pisrl.model")
    roles = labeller.label(words, postags, arcs)
    for role in roles:
        print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    '''
    # Returns three lists: the SBV head words (the predicates), the corresponding
    # subjects, and the sentence remainders. Be sure to check whether they are all [].
    return SBV_set, Subject_label_set, Word_of_speech_content
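# Hypothetical call of ltp_process() above; the model files are assumed to sit in
# the working directory, get_stops_words() to be defined, and the sentence is only an example.
sbv_heads, subjects, tails = ltp_process('小明说他明天去北京。')
# sbv_heads: predicate verbs that head an SBV relation
# subjects:  the words found in the subject position of those SBV arcs
# tails:     for each hit, the words following the predicate
print(sbv_heads, subjects, tails)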
import pandas as pd
import literature
import pyltp
import pickle
import os
from trigger_dict import TriggerDict
from math import inf

MIN_SENTENCE_NUM = 140
STOP_WORD_PATH = './相关词表/停用词词表.txt'
LTP_SEGMENT_MODE = './LTP_model/cws.model'
LTP_POS_MODE = './LTP_model/pos.model'
LTP_PARSE_MODE = './LTP_model/parser.model'
SEGMENTOR = pyltp.Segmentor()
POSTARGGER = pyltp.Postagger()
PARSER = pyltp.Parser()
with open('./相关词表/线索词词表.txt', 'r', encoding='utf-8') as f:
    CLUE_WORDS = f.read().splitlines()


def load_model():
    """
    Load the LTP segmentation, POS tagging, and dependency parsing models.
    """
    SEGMENTOR.load(LTP_SEGMENT_MODE)
    POSTARGGER.load(LTP_POS_MODE)
    PARSER.load(LTP_PARSE_MODE)


def release_model():
    """
    Release the LTP segmentation, POS tagging, and dependency parsing models.
    """
    SEGMENTOR.release()
    POSTARGGER.release()
    PARSER.release()
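# A brief sketch of using the module-level models above, assuming the model files
# exist at the configured paths; the sentence is a placeholder.
load_model()
words = list(SEGMENTOR.segment('今天天气很好'))
postags = list(POSTARGGER.postag(words))
arcs = PARSER.parse(words, postags)
print(words)
print('\t'.join('%d:%s' % (arc.head, arc.relation) for arc in arcs))
release_model()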