def __init__(self, articles_filename='articles.csv', record_filename='record.csv',
             rule_reference_filename='rule_reference.txt', LTP_DIR="ltp_data_v3.4.0/",
             filter_dictionary=['有限公司']):
    self.articles_filename = articles_filename
    self.record_filename = record_filename
    self.rule_reference_filename = rule_reference_filename
    ############################### Load the LTP models #########################################
    self.LTP_DIR = LTP_DIR
    # word segmentation model
    self.segmentor = pyltp.Segmentor()
    self.segmentor.load(os.path.join(self.LTP_DIR, "cws.model"))
    # POS tagging model
    self.postagger = pyltp.Postagger()
    self.postagger.load(os.path.join(self.LTP_DIR, 'pos.model'))
    # named entity recognition model
    self.recognizer = pyltp.NamedEntityRecognizer()
    self.recognizer.load(os.path.join(self.LTP_DIR, 'ner.model'))
    # dependency parsing model
    self.parser = pyltp.Parser()
    self.parser.load(os.path.join(self.LTP_DIR, 'parser.model'))
    self.filter_dictionary = filter_dictionary
    self.left_postags_dict = {}
    self.left_word_dict = {}
    self.mid_postags_dict = {}
    self.mid_word_dict = {}
    self.right_postags_dict = {}
    self.right_word_dict = {}
    self.CMP_dict = {}
    self.SBV_dict = {}
    self.VOB_dict = {}
def build_files(self):
    """Iterate over the raw documents, run word segmentation and POS tagging,
    remove stop words, and build a list of FileItem objects."""
    files = []
    category_id = 0
    segmentor = pyltp.Segmentor()
    segmentor.load(r'C:\Users\51694\PycharmProjects\paper\ltp_model\cws.hyp')
    postagger = pyltp.Postagger()
    postagger.load(r'C:\Users\51694\PycharmProjects\paper\ltp_model\pos.hyp')
    parser = pyltp.Parser()
    parser.load(r'C:\Users\51694\PycharmProjects\paper\ltp_model\parser.hyp')
    for ids, path in enumerate(self.file_paths()):
        with open(path, 'r', encoding='utf-8') as f:
            try:
                category = self.path2category(path)
                if category not in self.category_ids:
                    self.category_ids[category] = category_id
                    category_id += 1
                raw = self.process_line(f.read())
                words = self.remove_stop_words(list(segmentor.segment(raw)))
                words = self.clean_specific(words)
                pos = list(postagger.postag(words))
                parse_result = list(parser.parse(words, pos))
                files.append(FileItem(ids, category, words, pos, parse_result))
            except UnicodeDecodeError:
                logging.warning(path + ' failed to decode as UTF-8; check the file format')
                continue
    segmentor.release()
    postagger.release()
    parser.release()
    return files
def parser(words, postags):
    parser = pyltp.Parser()
    parser.load(par_model_path)
    arcs = parser.parse(words, postags)
    print('\t'.join('%d:%s' % (arc.head, arc.relation) for arc in arcs))
    parser.release()
    return arcs
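# A minimal usage sketch for the parser() helper above (not from the source):
# it assumes `par_model_path` points at a valid parser.model, and the words and
# POS tags below are hand-written for illustration (normally they come from the
# LTP segmentor and postagger).
words = ['中国', '进出口', '银行', '与', '中国银行', '加强', '合作']
postags = ['ns', 'v', 'n', 'c', 'ni', 'v', 'v']
arcs = parser(words, postags)  # prints "head:relation" pairs and returns the arcs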
def __init__(self):
    model_path = '/home/lnn/Documents/postag/ltp_data_v3.4.0/'
    self.seg = pyltp.Segmentor()
    self.seg.load(model_path + 'cws.model')
    self.pos = pyltp.Postagger()
    self.pos.load(model_path + 'pos.model')
    self.parser = pyltp.Parser()
    self.parser.load(model_path + 'parser.model')
def Parser(self, words=None, postags=None, sent=None):
    if self.__parser is None:
        self.__parser = pyltp.Parser()
        self.__parser.load(self.__par_model_path)
        print("Loaded parser model successfully!")
    if sent is not None:
        words = self.Segmentor(sent)
        postags = self.Postagger(words)
    arcs = self.__parser.parse(words, postags)
    return arcs
def Parser(self, words=None, postags=None, sent=None):
    if self.__parser is None:
        self.__parser = pyltp.Parser()
        self.__parser.load(self.__par_model_path)
    if sent is not None:
        words = self.Segmentor(sent)
        postags = self.Postagger(words)
    arcs = self.__parser.parse(words, postags)
    return arcs
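# A sketch of how these lazily loading wrappers might be called. The enclosing
# class and instance names are not shown in the source; `NLPWrapper`/`nlp` are
# hypothetical.
nlp = NLPWrapper()
arcs = nlp.Parser(sent='中国进出口银行与中国银行加强合作')
for arc in arcs:
    print(arc.head, arc.relation)  # arc.head is 1-based; 0 denotes the HED root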
def do_parers(self, words, postags):
    if self.parser is None:
        self.parser = pyltp.Parser()
        if self.debug:
            load_start = default_timer()
        self.parser.load(os.path.join(self.model_dir, 'parser.model'))
        if self.debug:
            load_use = default_timer() - load_start
            self.loger.debug("loading parser.model took [ %f ] s" % load_use)
    arcs = self.parser.parse(words, postags)
    return arcs
def __init__(self):
    self.path = 'ltp_data_v3.4.0/'  # download from https://ltp.ai/download.html (version 3.4.0)
    self.segmentor = pp.Segmentor()
    self.segmentor.load(self.path + "cws.model")  # word segmentation model
    self.postagger = pp.Postagger()
    self.postagger.load(self.path + "pos.model")  # POS tagging model
    self.recognizer = pp.NamedEntityRecognizer()
    self.recognizer.load(self.path + "ner.model")  # named entity recognition model
    self.parser = pp.Parser()
    self.parser.load(self.path + "parser.model")  # dependency parsing model
    self.labeller = pp.SementicRoleLabeller()
    self.labeller.load(self.path + "pisrl.model")  # semantic role labelling model
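# A minimal end-to-end sketch for the pipeline object above (assumptions: the
# class name LtpPipeline is invented here for illustration, and all five models
# loaded successfully). It chains segmentation, tagging, NER, parsing, and SRL.
ltp = LtpPipeline()
words = list(ltp.segmentor.segment('元芳你怎么看'))
postags = list(ltp.postagger.postag(words))
netags = list(ltp.recognizer.recognize(words, postags))
arcs = ltp.parser.parse(words, postags)
roles = ltp.labeller.label(words, postags, arcs)
for role in roles:
    print(role.index, [(arg.name, arg.range.start, arg.range.end) for arg in role.arguments])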
def __init__(self, *args, **kwargs):
    self.__LTP_DATA_DIR = 'D:\\NLP\\ltp_data'
    self.__cws_model_path = os.path.join(self.__LTP_DATA_DIR, 'cws.model')
    self.__pos_model_path = os.path.join(self.__LTP_DATA_DIR, 'pos.model')
    self.__par_model_path = os.path.join(self.__LTP_DATA_DIR, 'parser.model')
    self.segmentor = pyltp.Segmentor()
    self.segmentor.load_with_lexicon(self.__cws_model_path, './../data/word_dict.txt')
    self.postagger = pyltp.Postagger()
    self.postagger.load(self.__pos_model_path)
    self.parser = pyltp.Parser()
    self.parser.load(self.__par_model_path)
    self.tags_dict = {}
def __init__(self, ltp_path, dependency=False):
    self.dependency = dependency
    cws_model_path = os.path.join(ltp_path, 'cws.model')
    pos_model_path = os.path.join(ltp_path, 'pos.model')
    ner_model_path = os.path.join(ltp_path, 'ner.model')
    dp_model_path = os.path.join(ltp_path, 'parser.model')
    self.seg = pyltp.Segmentor()
    self.pos = pyltp.Postagger()
    self.ner = pyltp.NamedEntityRecognizer()
    # self.srl = pyltp.SementicRoleLabeller()
    self.seg.load(cws_model_path)
    self.pos.load(pos_model_path)
    self.ner.load(ner_model_path)
    # self.srl.load(srl_model_path)
    if dependency:
        self.dp = pyltp.Parser()
        self.dp.load(dp_model_path)
def __init__(self,
             seg_model_path='ltp_data_v3/ltp_data_v3.4.0/cws.model',
             seg_lexicon_path='lexicon/lexicon_test',
             pos_model_path='ltp_data_v3/ltp_data_v3.4.0/pos.model',
             rec_model_path='ltp_data_v3/ltp_data_v3.4.0/ner.model',
             par_model_path='ltp_data_v3/ltp_data_v3.4.0/parser.model'):
    self.seg_model_path = seg_model_path
    self.seg_lexicon_path = seg_lexicon_path
    self.segmentor = pyltp.Segmentor()
    self.segmentor.load_with_lexicon(self.seg_model_path, self.seg_lexicon_path)
    self.pos_model_path = pos_model_path
    self.postagger = pyltp.Postagger()
    self.postagger.load(self.pos_model_path)
    self.rec_model_path = rec_model_path
    self.recognizer = pyltp.NamedEntityRecognizer()
    self.recognizer.load(self.rec_model_path)
    self.par_model_path = par_model_path
    self.parser = pyltp.Parser()
    self.parser.load(self.par_model_path)
def ltp_init(self):
    import pyltp
    LTP_DATA_DIR = '/nas/data/m1/panx2/lib/ltp/ltp_data_v3.4.0'
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
    self.model_ltp_splitter = pyltp.SentenceSplitter()
    self.model_ltp_segmentor = pyltp.Segmentor()
    self.model_ltp_segmentor.load(cws_model_path)
    self.model_ltp_postagger = pyltp.Postagger()
    self.model_ltp_postagger.load(pos_model_path)
    self.model_ltp_recognizer = pyltp.NamedEntityRecognizer()
    self.model_ltp_recognizer.load(ner_model_path)
    self.model_ltp_dparser = pyltp.Parser()
    self.model_ltp_dparser.load(par_model_path)
    self.parse = self._parse
    self.sent_seger = self.ltp_sent_seger
    self.tokenizer = self.ltp_tokenizer
    self.processor = self.ltp_processor
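# Sketches of what the aliased methods could look like (assumptions: the bodies
# of _parse / ltp_sent_seger / ltp_tokenizer / ltp_processor are not shown in
# the source, so the _sketch methods below are hypothetical). pyltp's
# SentenceSplitter.split is a static call.
def ltp_sent_seger_sketch(self, text):
    return list(pyltp.SentenceSplitter.split(text))

def ltp_tokenizer_sketch(self, sent):
    return list(self.model_ltp_segmentor.segment(sent))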
def _model_initialize(self):
    if self.__segmentor is None:
        self.__segmentor = pyltp.Segmentor()
        if self.__seg_lexicon_path is None:
            self.__segmentor.load(self.__seg_model_path)
        else:
            self.__segmentor.load_with_lexicon(self.__seg_model_path, self.__seg_lexicon_path)
    if self.__postagger is None:
        self.__postagger = pyltp.Postagger()
        if self.__seg_lexicon_path is None:
            self.__postagger.load(self.__pos_model_path)
        else:
            self.__postagger.load_with_lexicon(self.__pos_model_path, self.__seg_lexicon_path)
    if self.__recognizer is None:
        self.__recognizer = pyltp.NamedEntityRecognizer()
        self.__recognizer.load(self.__rec_model_path)
    if self.__parser is None:
        self.__parser = pyltp.Parser()
        self.__parser.load(self.__par_model_path)
LTP_DIR = "/root/transaction/udf/model/ltp_data_v3.4.0" #分词模型 segmentor = pyltp.Segmentor() segmentor.load(os.path.join(LTP_DIR, "cws.model")) #词性模型 postagger = pyltp.Postagger() postagger.load(os.path.join(LTP_DIR, 'pos.model')) #命名实体模型 recognizer = pyltp.NamedEntityRecognizer() recognizer.load(os.path.join(LTP_DIR, 'ner.model')) #依存句法分析 parser = pyltp.Parser() parser.load(os.path.join(LTP_DIR, 'parser.model')) @tsv_extractor @returns(lambda doc_id="text", sentence_index="int", sentence_text="text", tokens="text[]", lemmas="text[]", pos_tags="text[]", ner_tags= "text[]", dep_types="text[]", dep_tokens="int[]", : []) def extract( doc_id="text", content="text", ): """ 使用pyltp 提取文本信息 """ #分句
def parse(words, postags):
    global parser_
    if parser_ is None:
        parser_ = pyltp.Parser()
        parser_.load(ltp_models['parser'])
    return parser_.parse(words, postags)
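# For parse() above to work, the module must define the global parser handle and
# the model registry somewhere; a plausible setup (assumed, not shown in the source):
parser_ = None  # created lazily on the first parse() call
ltp_models = {'parser': 'ltp_data_v3.4.0/parser.model'}  # hypothetical path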
def ltp_process(sentence, old_SI={}):
    # Load stop words so that stop-word SBV heads (e.g. "是") can be filtered out.
    stop_words = get_stops_words()
    # word segmentation
    segmentor = pyltp.Segmentor()
    segmentor.load("./model/cws.model")
    words = segmentor.segment(sentence)
    segmentor.release()
    # POS tagging
    postagger = pyltp.Postagger()
    postagger.load("./model/pos.model")
    postags = postagger.postag(words)  # list-of-string parameter is supported since 0.1.5
    postagger.release()
    # dependency parsing
    parser = pyltp.Parser()
    parser.load("./model/parser.model")
    arcs = parser.parse(words, postags)
    parser.release()
    # Semantic role labelling, loaded up front on purpose: once the SBV's child is
    # judged to relate to "说" (say), that word must be extracted; in short, a word
    # that is both SBV and A0 is necessarily the subject.
    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./model/pisrl.model")
    roles = labeller.label(words, postags, arcs)
    noun_tags = ['nh', 'nd', 'n', 'ni', 'nl', 'ns', 'nt', 'nz']
    SI_words = {}  # word -> position in the sentence
    for tag in noun_tags:
        SI_index = np.argwhere(np.array(postags) == tag).reshape(-1).tolist()
        for j in SI_index:
            SI_words[words[j]] = j
    SBV_set = list()
    Subject_label_set = list()
    Word_of_speech_content = list()
    Index_of_Subjet = 0
    SI_postags = {}
    si_SBV_postag = []
    si_VOB_postag = []
    for arc in arcs:
        # SBV_index = get_repeat(arc.head, "SBV")
        k = Index_of_Subjet
        if arc.relation == "SBV" and words[arc.head - 1] not in stop_words:
            # Admittedly not fully rigorous: we should not rely on the SBV check
            # alone, although once an SBV is found that word is necessarily A0.
            SBV_set.append(words[arc.head - 1])  # arc.head counts from 1; store the predicate the SBV points to
            # subject check
            if words[Index_of_Subjet] in ['他', '他们', '你', '你们', '我', '我们', '她', '她们']:
                # Coreference resolution: if old_SI contains the same role, replace
                # the pronoun with the highest-scoring entity. A correction is still
                # needed: noun compounds such as 习近平+总书记 should be one word, or
                # 习近平 should carry the same weight as 总书记.
                if old_SI:
                    ag2entity = int(np.argmax(list(old_SI.params.values())))  # index of the highest score
                    words[Index_of_Subjet] = list(old_SI.params.keys())[ag2entity]
                Subject_label_set.append(words[Index_of_Subjet])
            else:
                # Not a pronoun, so this position must be the subject.
                Subject_label_set.append(words[Index_of_Subjet])
            if postags[arc.head - 1] == 'v':
                si_SBV_postag.append((words[Index_of_Subjet], Index_of_Subjet))
                Word_of_speech_content.append(intro_speech(''.join(words[arc.head:])))  # extract the quoted content
            Index_of_Subjet += 1
            SI_postags[arc.relation] = si_SBV_postag
        elif arc.relation == 'VOB' and words[arc.head - 1] not in stop_words:
            # object check
            if words[Index_of_Subjet] in ['他', '他们', '你', '你们', '我', '我们', '她', '她们']:
                # Coreference resolution: use the object position and highest-scoring
                # element from the previous sentence.
                pass
            else:
                # Not a pronoun, so this position must be the object.
                Subject_label_set.append(words[Index_of_Subjet])
            si_VOB_postag.append((words[Index_of_Subjet], Index_of_Subjet))
            Index_of_Subjet += 1
            SI_postags[arc.relation] = si_VOB_postag
        else:
            Index_of_Subjet += 1
            continue
    # If these lists are empty, the sentence needs no further analysis.
    Forcus_point = Si(SI_words, SI_postags, old_SI)  # focus-of-attention set
    Forcus_point.score()  # updates self.params
    recognizer = pyltp.NamedEntityRecognizer()
    recognizer.load("./model/ner.model")
    netags = recognizer.recognize(words, postags)
    # Returns: the SBV child words (HED), the subjects of those SBVs, the quoted
    # speech, and the focus set; any of these may be empty.
    return SBV_set, Subject_label_set, Word_of_speech_content, Forcus_point
def ltp_process(sentence):
    # Load stop words so that stop-word SBV heads (e.g. "是") can be filtered out.
    stop_words = get_stops_words()
    # word segmentation
    segmentor = pyltp.Segmentor()
    segmentor.load("./cws.model")
    words = segmentor.segment(sentence)
    print("\t".join(words))
    segmentor.release()
    # POS tagging
    postagger = pyltp.Postagger()
    postagger.load("./pos.model")
    postags = postagger.postag(words)  # list-of-string parameter is supported since 0.1.5
    print("\t".join(postags))
    postagger.release()
    # dependency parsing
    parser = pyltp.Parser()
    parser.load("./parser.model")
    arcs = parser.parse(words, postags)
    parser.release()
    # Semantic role labelling (currently unused), loaded up front on purpose: once
    # the SBV's child is judged to relate to "说" (say), that word must be extracted;
    # in short, a word that is both SBV and A0 is necessarily the subject.
    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./pisrl.model")
    roles = labeller.label(words, postags, arcs)
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    SBV_set = list()
    Subject_label_set = list()
    Word_of_speech_content = list()
    Index_of_Subjet = 0
    for arc in arcs:
        # SBV_index = get_repeat(arc.head, "SBV")
        k = Index_of_Subjet
        if arc.relation == "SBV" and words[arc.head - 1] not in stop_words:
            # Admittedly not fully rigorous: we should not rely on the SBV check
            # alone, although once an SBV is found that word is necessarily A0.
            SBV_set.append(words[arc.head - 1])  # arc.head counts from 1; store the predicate the SBV points to
            Subject_label_set.append(words[Index_of_Subjet])  # with an SBV, this position must be the subject
            Word_of_speech_content.append(words[arc.head:])  # roughly the part after the SBV head word
            Index_of_Subjet += 1
        else:
            Index_of_Subjet += 1
            continue
    # If these lists are empty, the sentence needs no further analysis.
    # Returns: the SBV child words (HED), and the subjects of those SBVs.
    # Note that any of the three lists may be [].
    return SBV_set, Subject_label_set, Word_of_speech_content
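# A usage sketch for ltp_process above (assuming the ./*.model files exist and
# get_stops_words is importable; the sentence is illustrative):
SBV_set, subjects, speech = ltp_process('李克强总理说经济稳中向好。')
print(SBV_set, subjects, speech)  # any of the three lists may be empty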
import pandas as pd
import literature
import pyltp
import pickle
import os
from trigger_dict import TriggerDict
from math import inf

MIN_SENTENCE_NUM = 140
STOP_WORD_PATH = './相关词表/停用词词表.txt'
LTP_SEGMENT_MODE = './LTP_model/cws.model'
LTP_POS_MODE = './LTP_model/pos.model'
LTP_PARSE_MODE = './LTP_model/parser.model'
SEGMENTOR = pyltp.Segmentor()
POSTARGGER = pyltp.Postagger()
PARSER = pyltp.Parser()
with open('./相关词表/线索词词表.txt', 'r', encoding='utf-8') as f:
    CLUE_WORDS = f.read().splitlines()


def load_model():
    """Load the LTP segmentation, POS tagging, and parsing models."""
    SEGMENTOR.load(LTP_SEGMENT_MODE)
    POSTARGGER.load(LTP_POS_MODE)
    PARSER.load(LTP_PARSE_MODE)


def release_model():
    """Release the LTP segmentation, POS tagging, and parsing models."""
    SEGMENTOR.release()
    POSTARGGER.release()
    PARSER.release()
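# A usage sketch for the module-level models above (assumed driver code; the
# sentence is illustrative):
load_model()
words = list(SEGMENTOR.segment('我爱自然语言处理'))
postags = list(POSTARGGER.postag(words))
arcs = PARSER.parse(words, postags)
release_model()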
def ltp_process(sentence, old_SI={}):
    # Load stop words so that stop-word SBV heads (e.g. "是") can be filtered out.
    stop_words = get_stops_words()
    # word segmentation
    segmentor = pyltp.Segmentor()
    segmentor.load("./model/cws.model")
    words = segmentor.segment(sentence)
    segmentor.release()
    # POS tagging
    postagger = pyltp.Postagger()
    postagger.load("./model/pos.model")
    postags = postagger.postag(words)  # list-of-string parameter is supported since 0.1.5
    postagger.release()
    # dependency parsing
    parser = pyltp.Parser()
    parser.load("./model/parser.model")
    arcs = parser.parse(words, postags)
    parser.release()
    # nh:person name, nd:direction noun, n:general noun, ni:organization name,
    # nl:location noun, ns:geographical name, nt:temporal noun, nz:other proper noun
    noun_tags = ['nh', 'nd', 'n', 'ni', 'nl', 'ns', 'nt', 'nz']
    SI_words = {}  # word -> position in the sentence
    for tag in noun_tags:
        # find the sentence positions of all words whose POS matches this tag
        SI_index = np.argwhere(np.array(postags) == tag).reshape(-1).tolist()
        for j in SI_index:
            # record the word and its position (e.g. 蒋丽芸)
            SI_words[words[j]] = j
    SBV_set = list()
    Subject_label_set = list()
    Word_of_speech_content = list()
    Index_of_Subjet = 0
    SI_postags = {}
    si_SBV_postag = []
    si_VOB_postag = []
    for arc in arcs:
        # SBV_index = get_repeat(arc.head, "SBV")
        k = Index_of_Subjet
        if arc.relation == "SBV" and words[arc.head - 1] not in stop_words:
            # Admittedly not fully rigorous: we should not rely on the SBV check
            # alone, although once an SBV is found that word is necessarily A0.
            SBV_set.append(words[arc.head - 1])  # arc.head counts from 1; store the predicate the SBV points to
            # subject check
            if words[Index_of_Subjet] in ['他', '他们', '你', '你们', '我', '我们', '她', '她们']:
                # Coreference resolution: if old_SI contains the same role, replace
                # the pronoun with the highest-scoring entity. A correction is still
                # needed: noun compounds such as 习近平+总书记 should be one word, or
                # 习近平 should carry the same weight as 总书记.
                if old_SI:
                    ag2entity = int(np.argmax(list(old_SI.params.values())))  # index of the highest score
                    words[Index_of_Subjet] = list(old_SI.params.keys())[ag2entity]
                Subject_label_set.append(words[Index_of_Subjet])
            else:
                # Not a pronoun, so this position must be the subject.
                Subject_label_set.append(words[Index_of_Subjet])
            if postags[arc.head - 1] == 'v':
                si_SBV_postag.append((words[Index_of_Subjet], Index_of_Subjet))
                Word_of_speech_content.append(intro_speech(''.join(words[arc.head:])))  # extract the quoted content
            Index_of_Subjet += 1
            SI_postags[arc.relation] = si_SBV_postag
        elif arc.relation == 'VOB' and words[arc.head - 1] not in stop_words:
            # object check
            if words[Index_of_Subjet] in ['他', '他们', '你', '你们', '我', '我们', '她', '她们']:
                # Coreference resolution: use the object position and highest-scoring
                # element from the previous sentence.
                pass
            else:
                # Not a pronoun, so this position must be the object.
                Subject_label_set.append(words[Index_of_Subjet])
            si_VOB_postag.append((words[Index_of_Subjet], Index_of_Subjet))
            Index_of_Subjet += 1
            SI_postags[arc.relation] = si_VOB_postag
        else:
            Index_of_Subjet += 1
            continue
    # If these lists are empty, the sentence needs no further analysis.
    Forcus_point = Si(SI_words, SI_postags, old_SI)  # focus-of-attention set
    Forcus_point.score()  # updates self.params
    # Returns: the SBV child words (HED), the subjects of those SBVs, the quoted
    # speech, and the focus set; any of these may be empty.
    return SBV_set, Subject_label_set, Word_of_speech_content, Forcus_point
def __init__(self):
    self.tagger = pyltp.Postagger()
    self.parser = pyltp.Parser()
    self.tagger.load(path_to_tagger)
    self.parser.load(path_to_parser)