def seg(content):
    # Set your own model path
    MODELDIR = "/home/liuqi/ltp/pyltp/ltp_data/"
    segmentor = Segmentor()
    segmentor.load(MODELDIR + "cws.model")
    tWords = segmentor.segment(content)
    return tWords
def split_words(sentence="中国进出口银行与中国银行加强合作", type_list=0):
    """Segment a sentence; if type_list is truthy, return the result as a plain list."""
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    words = segmentor.segment(sentence)
    if type_list:
        return [i for i in words]
    return words
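# Hedged usage sketch for split_words above: it assumes `os`, `Segmentor` and a valid
# MODELDIR are already defined in this module; the sample sentence is a placeholder.
if __name__ == "__main__":
    print(split_words("中国进出口银行与中国银行加强合作", type_list=1))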
class pnn_count(): def __init__(self): self.mydict = {} self.segmentor = Segmentor() self.segmentor.load('cws.model') self.hash_dict() self.ltp_process() def ltp_process(self): sentence_num = 0 right_num = 0; f = open('pnn_annotated.txt','r') for line in f: sentence_num += 1 #print line line_array = line.split('\t') line = line_array[1] count = 0 words = self.segmentor.segment(line) for i in words: if self.mydict.has_key(i): count = count + self.mydict[i] if count > 0: answer = "positive" if line_array[0] == '1': right_num += 1 elif count == 0: answer = "neuter" if line_array[0] == '0': right_num += 1 else: answer = "negative" if line_array[0] == '-1': right_num += 1 #print "My guess is %s" %answer #print "THe right answer is %s" %line_array[0] #print "result %d" % count f.close() print "total sentence is %d, right answer is %d" %(sentence_num,right_num) def hash_dict(self): f = open('negative.txt','r') for line in f: line = line.strip('\n') line = line.strip('\r') self.mydict[line] = -1 f.close() f = open('positive.txt','r') for line in f: line = line.strip('\n') line = line.strip('\r') self.mydict[line] = 1 f.close()
def __init__(self, cws_model_path=CWS_MODEL_PATH, stop_words_dir=STOP_WORDS_DIR):
    self.raw_data = None
    self.processed_data = None
    self.words_dict = None
    self.STOP_WORDS = self._load_stop_words(stop_words_dir)
    self.segmentor = Segmentor()
    self.segmentor.load(cws_model_path)
def segmentation(filename, output_filename):
    print "segmenting '%s' to '%s'" % (filename, output_filename)
    f = open(filename, "r")
    lines = f.readlines()
    f.close()

    MODELDIR = "./ltp_data/"
    # segment
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    # postag
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    # Named Entity Recognize
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    # Parse and get SVO
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))

    f = open(output_filename, "w")
    fner = open(output_filename.split(".")[0] + "_ner.txt", "w")
    for _line in lines:
        line = _line[:-1]
        if line[-1] in "\n\r":
            line = line[:-1]
        words = segmentor.segment(line)
        postags = postagger.postag(words)
        # netags = recognizer.recognize(words, postags)
        # arcs = parser.parse(words, postags)
        for i in range(len(words)):
            f.write("%s/%s\t" % (words[i], postags[i]))
            # if netags[i] != 'O':
            #     fner.write("%s/%s\t" % (words[i], netags[i]))
        f.write("\n")
        # fner.write("\n")
    f.close()
    fner.close()
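# Hedged usage sketch for segmentation() above: the file names are illustrative only,
# and the ./ltp_data/ model directory is assumed to exist next to the script.
if __name__ == "__main__":
    segmentation("raw_corpus.txt", "segmented_corpus.txt")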
def __init__(self):
    self.mydict = {}
    self.lines = []
    self.lines_num = 3000
    self.c = [0, 0, 0]  # PNN
    self.w_c = [{}, {}, {}]
    self.segmentor = Segmentor()
    self.segmentor.load('cws.model')
    self.read_file()
    self.train()
    self.test()
def pyltp_words():
    from pyltp import Segmentor, Postagger
    segmentor = Segmentor()
    segmentor.load("/home/fredgan/github/pyltp/ltp_data/cws.model")
    # postagger = Postagger()
    # postagger.load("~/github/pyltp/ltp_data/cpos.model")
    for line in open(sys.argv[1], 'r'):
        try:
            style, sentence = line.strip().split('\t')
        except:
            continue
        style_dic.setdefault(style, {})
        words = segmentor.segment(sentence)
        # postags = postagger.postag(words)
        for w in words:
            if w in style_dic[style]:
                style_dic[style][w] += 1
            else:
                style_dic[style][w] = 1
    for k, v in style_dic.iteritems():
        v_list = sorted(v.iteritems(), key=lambda d: d[1], reverse=True)
        print k + "\t" + " ".join(map(lambda i: i[0] + ":" + str(i[1]), v_list[0:min(50, len(v_list))]))
def process(index): ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir) sys.path.append(os.path.join(ROOTDIR, "lib")) # Set your own model path MODELDIR=os.path.join(ROOTDIR, "ltp_data") segmentor = Segmentor() segmentor.load(os.path.join(MODELDIR, "cws.model")) finname = "o_"+str(index)+".txt" foutname = "p_"+str(index)+".txt" print finname count = 0 fin = codecs.open(finname, encoding='utf-8') with codecs.open(foutname, 'w', encoding="utf-8") as fout: while 1: line = fin.readline() if not line: break tmp = line.split(" ^ {")[1] # Get JSON tmp = "{"+tmp data = json.loads(tmp) content = data['content'] # error_correction(content) content = content.strip() segmentation = "" for line in content.split("\n"): line = line.encode("utf-8") words = segmentor.segment(line) segmentation += "/".join(words) segmentation += "/" # Return type of the function is str, not unicode. Thus need to change into unicode. segmentation = unicode(segmentation, "utf-8") pinyin = add_pinyin(segmentation) obj = {} obj['flavor'] = data['flavor'] obj['environment'] = data['environment'] obj['service'] = data['service'] obj['content'] = data['content'] obj['segmentation'] = segmentation obj['pinyin'] = pinyin tmpstr = json.dumps(obj,ensure_ascii=False) fout.write(tmpstr) fout.write('\n') count += 1 print count segmentor.release()
def __init__(self, data):
    # `data` is the raw text to analyze; it was referenced but never defined in the
    # original constructor, so it is taken as a parameter here.
    self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')  # segmentation model, file name `cws.model`
    self.pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')  # POS tagging model, file name `pos.model`
    self.ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')  # named entity recognition model, file name `ner.model`

    segmentor = Segmentor()
    segmentor.load(self.cws_model_path)
    self.words = segmentor.segment(data)
    # print("|".join(words))
    segmentor.release()

    postagger = Postagger()                      # initialize the instance
    postagger.load(self.pos_model_path)          # load the model
    self.postags = postagger.postag(self.words)  # POS tagging
    # print('\t'.join(postags))
    postagger.release()                          # release the model

    recognizer = NamedEntityRecognizer()         # initialize the instance
    recognizer.load(self.ner_model_path)         # load the model
    self.netags = recognizer.recognize(self.words, self.postags)  # named entity recognition
    # print('\t'.join(netags))
    recognizer.release()                         # release the model
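# Minimal standalone sketch of the same load -> use -> release pipeline shown in the
# constructor above. It assumes `ltp_data_dir` points at a valid LTP model directory;
# the function name and its arguments are illustrative, not part of the original class.
import os
from pyltp import Segmentor, Postagger, NamedEntityRecognizer

def ner_pipeline(sentence, ltp_data_dir):
    segmentor = Segmentor()
    segmentor.load(os.path.join(ltp_data_dir, 'cws.model'))
    words = list(segmentor.segment(sentence))
    segmentor.release()

    postagger = Postagger()
    postagger.load(os.path.join(ltp_data_dir, 'pos.model'))
    postags = list(postagger.postag(words))
    postagger.release()

    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(ltp_data_dir, 'ner.model'))
    netags = list(recognizer.recognize(words, postags))
    recognizer.release()
    return words, postags, netags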
class ltpTools(): def __init__(self): #initialize every ltp tool LTP_DIR = "/home/demo1/support_ltp" #分词器 self.segmentor = Segmentor() self.segmentor.load(os.path.join(LTP_DIR, "cws.model")) #词性标注 self.postagger = Postagger() self.postagger.load(os.path.join(LTP_DIR, "pos.model")) #依存句法分析 self.parser = Parser() self.parser.load(os.path.join(LTP_DIR, "parser.model")) #命名实体识别 #self.recognizer = NamedEntityRecognizer() #self.recognizer.load(os.path.join(LTP_DIR, "ner.model")) #语义角色标注模块 self.labeller = SementicRoleLabeller() self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model')) print('模型已全部加载') def __del__(self): self.segmentor.release() self.labeller.release() self.postagger.release() self.postagger.release() print('模型已全部释放') def segANDpos(self, sen): ''' 分词加词性标注,同时返回词列表和词性列表,一一对应 ''' words = self.segmentor.segment(sen) postags = self.postagger.postag(words) return list(words), list(postags) '''语义角色标注''' def format_labelrole(self, words, postags): #依赖于词性的标注,做依存句法的分析 #解释: #依存句法分析是基于词性标注的。 arcs = self.parser.parse(words, postags) #根据依存句法的分析,标注语义角色 roles = self.labeller.label(words, postags, arcs) #以字典储存,key为编号,value为列表 #而且是嵌套字典,以arg.name作为key #这个字典的含义就是:每个角色的索引是一级key,二级字典以语义角色类型为key roles_dict = {} for role in roles: roles_dict[role.index] = { arg.name: [arg.name, arg.range.start, arg.range.end] for arg in role.arguments } print(roles_dict) return roles_dict '''句法分析---为句子中的每个词语维护一个保存句法依存儿子节点的字典''' def build_parse_child_dict(self, words, postags, arcs): #其数据结构是: #这个list底下是一个个字典,每个字典的key是关系名称,每个字典的value是这个关系所对应的词语,这样就得到了父节点们所拥有的关系及有这种关系的孩子 child_dict_list = [] #这个list的意义就是展示每个词的依存关系 format_parse_list = [] #一级循环:对每个词分析 for index in range(len(words)): #预设孩子字典 child_dict = dict() #二级循环:查每个词的语义角色 for arc_index in range(len(arcs)): #这里无非就是查一下我到底有没有成为谁的爸爸,如果有的话就登记一下 if arcs[arc_index].head == index + 1: #arcs的索引从1开始 if arcs[arc_index].relation in child_dict: child_dict[arcs[arc_index].relation].append(arc_index) else: child_dict[arcs[arc_index].relation] = [] child_dict[arcs[arc_index].relation].append(arc_index) child_dict_list.append(child_dict) rely_id = [arc.head for arc in arcs] # 提取依存父节点id relation = [arc.relation for arc in arcs] # 提取依存关系 heads = ['Root' if id == 0 else words[id - 1] for id in rely_id] # 匹配依存父节点词语 for i in range(len(words)): # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n'] a = [ relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1] ] format_parse_list.append(a) return child_dict_list, format_parse_list '''parser主函数''' def parser_main(self, sentence): '''显然这是一个类的主函数''' words = list(self.segmentor.segment(sentence)) postags = list(self.postagger.postag(words)) arcs = self.parser.parse(words, postags) child_dict_list, format_parse_list = self.build_parse_child_dict( words, postags, arcs) roles_dict = self.format_labelrole(words, postags) return words, postags, child_dict_list, roles_dict, format_parse_list
class tokenization_entis(): def __init__(self): self.LTP_DATA_DIR = "/home/mm/Downloads/ltp_data_v3.4.0/" self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model') self.segmentor = Segmentor() # 初始化实例 self.segmentor.load(self.cws_model_path) # 加载模型 self.train_res = self.read_train_res() # 读取tag文本,防止里面有空格去掉空格 # self.all_co_names = self.FDDC_co_list() def read_train_res(self): with open( '/home/mm/Documents/aliyun-FDDC-2018-Financial-Challenge-/chongzu.train' ) as rf: train_res = rf.read() train_res = re.sub(r'\(', '(', train_res) train_res = re.sub(r'\)', ')', train_res) return train_res def ltp_segmentor_release(self): self.segmentor.release() def tokenize_enti(self, path11): texx, entity_string = convert2txt(path11) # sentences = re.split(r'。', texx) # sentences.sort(key=len, reverse=True) entities = list(set(re.split(r'[\s~、,;/]', entity_string))) entities.sort(key=len) entities_arrows_list = list( set([ x if '~' in x else '' for x in re.split(r'\s', entity_string) ])) entities_arrows_list.sort(key=len, reverse=True) entities_arrows_list = entities_arrows_list[:-1] # 找出结果数据行并且把最后的回车符号去掉 patt_index = re.findall(r'\d{4,10}', path11)[0] res_rows = re.findall(r'(?<=\n){}[^\n]+(?=\n)'.format(patt_index), self.train_res) # 以下是整理train——res # 遍历结果,发现有简称全称的,把匹配的另一半加进去。 """主要目的是修正train——res文件,里面有简称或者全称,并不统一,为了让简称全称都出现, 使用正则提取对应的简称或全称,如果有顿号,把那些字串也分开提取,作为标注的标的,当然是先 把字符长度小的匹配出来,分词之后也是先把长度长的连起来。没问题的""" res_paired = {} # 临时定义一个res的列表,存储修改后的train res for x in range(len(res_rows)): res_row = res_rows[x] for y in range(6): res_paired[str(x) + str(y)] = [re.split(r'\t', res_row)[y]] for arrow_str in entities_arrows_list: for index, result_row in enumerate(res_rows): for indi, res_value in enumerate(re.split(r'\t', result_row)): if indi in [0, 1, 4, 5]: continue res_value_list = res_value.split('、') for res_value_split in res_value_list: if res_value_split in entities and res_value_split in arrow_str: # 找出配对的简称或者全称,添加,如果是股权/估值法/金额直接添加并且continue niki, fullna = re.split(r'~', arrow_str) fullna_first = fullna.split(',')[0] niki_split_list = re.split(r'[/、]', niki) # 对应的全称满足三个条件,长度/逗号 以及含有简称的几个字 if res_value_split in niki_split_list \ and len(fullna_first) < 18 \ and re.search(re.sub(r'(?<=[^屄\s])', '\s?', res_value_split), fullna_first): res_paired[str(index) + str(indi)].append(fullna_first) """ 由全称查简称时候要避免 公司/本公司/上市公司/发起人/申请人/, 含有这几个字的要剔除 """ if res_value_split == fullna_first: # 对应的简称满足几个条件: 包含在全程里面,不长于4个字,不等于 for niki_split in niki_split_list: if re.search(re.sub(r'(?<=[^屄\s])', '\s?', fullna_first), niki_split) \ and not re.search(r'(^公司$|^本公司$|环境$|^上市公司$|人$|资产|标的|交易|对方|发行|对象|股东|对手|单位)', re.sub(r'\s', '', niki_split)): res_paired[str(index) + str(indi)].append( niki_split) # 遍历公告的每一句,把每一句送进模型。 # words_n_words = '' # for i in sentences: words = self.segmentor.segment(entity_string) words = ' '.join(words) # 分词要使用更好的策略,更长一些,避免太短的句子,重复循环浪费流程 # # 下面是把所有目标主体合并在一起, 把55%股权这样的先分出来, # for ent in entities: # # 把words中所有是实体的中间去掉空格。使用双层sub # # 正则还是要多注释啊 # """ re.sub(r'(?<=\w)(?=\w)'','\s?',ent) 是把实体里面的每个字符中间插入“\s?” # 表示匹配任何以此序列出现但中间可能有空格的情况,分词之后join成空格分割的。然后找出words # 中出现这个序列的地方,将其换成没空格的""" # if len(ent) > 1: # if not re.search(r'([\d.]+%的?(?:股权|股份|权益))', ent): # 如果没有股权关键字,直接加上空格匹配pattern # patt_ent = re.sub(r'(?<=\w)(?=\w)', r'\s?', ent) # elif len(ent) > 7: # 如果有股权关键字,且长度比较长,就把前面主体提出来,单独分词 # patt_ent = re.sub(r'(?<=\w)(?=\w)',r'\s?', re.sub(r'的?[\d.]+%的?(股权|股份|权益)','', ent)) # else: # patt_ent = re.sub(r'(?<=\w)(?=\w)', r'\s?', ent) # # 下面一句把words中所有符合主体列表的项目,可能被分词分开的,重新合并起来,单独成行,在test时使用 # words = 
re.sub(r'{}'.format(patt_ent), '\s' + ent + '\s', words) # 然后把空格都换成回车,words竖起来了。 # words = re.sub(r'\s', '\n', words) # words = re.sub(r'\n+', '\n', words) """把words中所有是结果键值的,后缀上tab键和结果索引号。否则后缀tab键和字母o 目的是好的,就是让模型更容易找到目标,模型不需要判断开始和结束, 但是这样的正则太难了, 我无法将所有合适的实体 全部抽出来,而导致标注的缺失,那么还是把任务给模型了""" # for x in range(len(res_rows)): # for y in range(6): # index = str(x)+str(y) # tags_list = res_paired[index] for index, tags_list in res_paired.items(): # 表中的小表,可能有一个或多个成员,遍历一下,包括顿号分割的那些都可以标出来了,不影响合并好的实体字符串。 for sub_res in sorted(tags_list, key=len, reverse=True): if not index.endswith('0') and len(sub_res) > 1: patt_sub_res = re.sub(r'(?<=[^屄\s])', '\s?', sub_res) if re.search(r'{}'.format(patt_sub_res), words): spliter = re.findall(patt_sub_res, words)[0] words_split_list = re.split(spliter, words) spliter_tagged = re.sub(r'\s', '屄{}'.format(index[1]), spliter) words = spliter_tagged.join(words_split_list) # print(words) # words=re.sub(patt_sub_res, sub_res) # words= re.sub(r'{}(?=\n)'.format(sub_res), '\n{}\t{}\n'.format(sub_res, index), words) # train——result标注完了,现在标注o,就是把非数字结尾的行加上tab和o words = re.sub(r'\s', '\to\n', words) words = re.sub(r'(?<=屄\d)', '\n', words) words = re.sub(r'屄', '\t', words) return words
def getRelation(paragraph): """ paragraph: a list of string, each string is a sentence return: a list of relations and a dict which records the number of occurrence of differents DSNF """ relations = [] dict_DSNF = { 'num_DSNF1': 0, 'num_DSNF2': 0, 'num_DSNF3': 0, 'num_DSNF7': 0, } segmentor = Segmentor() segmentor.load(os.path.join(MODELDIR, "cws.model")) postagger = Postagger() postagger.load(os.path.join(MODELDIR, "pos.model")) parser = Parser() parser.load(os.path.join(MODELDIR, "parser.model")) recognizer = NamedEntityRecognizer() recognizer.load(os.path.join(MODELDIR, "ner.model")) for iteration, sentence in enumerate(paragraph): print("evaluate the " + str(iteration + 1) + "-th sentences") sentence = SentenceSplitter.split(sentence)[0] words = segmentor.segment(sentence) # print("\t".join(words)) postags = postagger.postag(words) # list-of-string parameter is support in 0.1.5 # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"]) # print("\t".join(postags)) arcs = parser.parse(words, postags) # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)) netags = recognizer.recognize(words, postags) # print("\t".join(netags)) # labeller = SementicRoleLabeller() # labeller.load(os.path.join(MODELDIR, "pisrl.model")) # roles = labeller.label(words, postags, arcs) # for role in roles: # print(role.index, "".join( # ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])) entityList = findEntities(netags) # print(entityList) entities = [] for i in entityList: l = '' for j in i: l += words[j] entities.append(l) print("entities in " + str(iteration + 1) + "-th sentence : ", entities) DSNF1_ret = DSNF1(arcs, entityList, words, netags) DSNF2_ret = DSNF2(arcs, entityList, words) DSNF3_ret = DSNF3(arcs, entityList, words, postags) DSNF7_ret = DSNF7(arcs, entityList, words) # print("DSNF1 result: ", DSNF1_ret) # print("DSNF2 result: ", DSNF2_ret) # print("DSNF3 result: ", DSNF3_ret) # print("DSNF7 result: ", DSNF7_ret) relation = [] for r in DSNF1_ret: dict_DSNF['num_DSNF1'] += 1 relation.append(r) relations.append(r) for r in DSNF2_ret: dict_DSNF['num_DSNF2'] += 1 relation.append(r) relations.append(r) for r in DSNF3_ret: dict_DSNF['num_DSNF3'] += 1 relation.append(r) relations.append(r) for r in DSNF7_ret: dict_DSNF['num_DSNF7'] += 1 relation.append(r) relations.append(r) print("with entities relation: ", relation) print("--" * 30) segmentor.release() postagger.release() parser.release() recognizer.release() # labeller.release() return relations, dict_DSNF
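# Hedged usage sketch for getRelation above: the input is a list of raw sentences and
# MODELDIR must already point at the LTP model directory; the sample sentence is a
# placeholder taken from elsewhere in this corpus.
if __name__ == "__main__":
    sample_paragraph = ["中国进出口银行与中国银行加强合作。"]
    relations, dsnf_counts = getRelation(sample_paragraph)
    print(relations)
    print(dsnf_counts)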
class NLPExecutor: def __init__(self): self.seg = Segmentor() self.seg.load(cwsPath) self.pos = Postagger() self.pos.load(posPath) self.parser = Parser() self.parser.load(parserPath) self.tr = TextRank4Sentence() ''' param: text:输入文本 return: 摘要的句子list ''' def generateSummary(self, text): # TODO 摘要生成实现方法待改进 self.tr.analyze(text=text) return self.tr.get_key_sentences(num=1) ''' param: text:输入文本 return: 分句的句子list ''' def splitSentences(self, text): return list(SentenceSplitter.split(text)) ''' param: sent1,sent2:两个句子 return: 两个句子的相似度 ''' def similarity(self, sent1, sent2): if sent1 == '' or sent2 == '': return 0 text1 = self.wordTokenize(sent1) text2 = self.wordTokenize(sent2) texts = [text1, text2] dictionary = corpora.Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] similarity = Similarity('-Similarity-index', corpus, num_features=len(dictionary)) return similarity[dictionary.doc2bow(text1)][1] # TODO VALIADATES THAT添加放在RUCM生成层 ''' def addValidate(self,sentence): tokens=self.wordTokenize(sentence) tokens[1]='VALIDATES THAT' return ''.join(tokens) ''' ''' param: sentence:一个句子 return: 分词词链,list,标点符号会被作为一个词 ''' def wordTokenize(self, sentence): return list(self.seg.segment(sentence)) ''' param: sentence:一个句子 wordlist:分词词链 return: 仅有词性标注的词性链,index与分词词链对应 ''' def posTag(self, sentence=None, wordlist=None): if sentence is not None: wordlist = list(self.seg.segment(sentence)) return list(self.pos.postag(wordlist)) ''' param: sentence:分词词典的文件路径,每个词独占一行的纯文本文件 wordlist:标注词典的文件路径,每个词及其词性占一行,词与词性标注之间空格分隔,可以有多个词性 return: 无 ''' def dictUpdate(self, segDict=None, posDict=None): if segDict is not None: self.seg.load_with_lexicon(cwsPath, segDict) if posDict is not None: self.pos.load_with_lexicon(posPath, posDict) ''' param: sentence:原始句子 wordlist:句子的分词词链 poslist:词性标注词链 return: 依存句法分析结果 ''' def parse(self, wordlist=None, text=None): if text is not None: wordlist = self.wordTokenize(text) poslist = self.posTag(wordlist=wordlist) return list(self.parser.parse(wordlist, poslist)) ''' param: sentence:Sentence对象 parselist:依存句法分析结果 return: 规范化句式之后的句子 ''' def normalize(self, sentence, parselist=None): # TODO 效果在调试时继续调整): wordlist = sentence.wordlist poslist = self.posTag(wordlist=wordlist) if parselist is None: parselist = self.parse(wordlist=wordlist) newWords = wordlist.copy() # TODO 替换IF,ELSE,THEN,DO,UNTIL #if sentence.type == 'conditional': # TODO if sentence.type != 'then': for i in range(0, len(wordlist)): if wordlist[i] == '如果': newWords[i] = 'IF' sentence.type = 'conditional' elif wordlist[i] == '那么': newWords[i] = 'THEN' elif wordlist[i] == '否则': newWords[i] = 'ELSE' elif wordlist[i] == '直到': newWords[i] = 'UNTIL' if sentence.type != 'conditional': sentence.type = 'circular' elif wordlist[i] == '同时': newWords[i] = 'MEANWHILE' #TODO 去量词效果 if sentence.type == 'then' or sentence.type == 'normal': for i in range(len(parselist) - 1, -1, -1): if parselist[i].relation == 'ATT' and (poslist[i] == 'm' or poslist[i] == 'q'): del newWords[i] if sentence.normalContent is None: sentence.normalContent = '' for word in newWords: sentence.normalContent += word ''' param: parselist:依存句法分析结果 return: 是否为简单句 ''' def isSimple(self, parselist): count = 0 for parse in parselist: if parse.relation == 'SBV': count += 1 if count == 1: return True else: return False ''' param: sentlist:句子集合 sent:单个句子 return: sentlist中与sent相似度最高的句子的索引与相似度 ''' def maxSimilarity(self, sentlist, sent): max = [-1, -1] for i in range(len(sentlist)): similarity = self.similarity(sentlist[i].originContent, sent.originContent) if 
similarity > max[1]: max = [i, similarity] return max
import sys, os

ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
sys.path = [os.path.join(ROOTDIR, "lib")] + sys.path

# Set your own model path
MODELDIR = os.path.join(ROOTDIR, "ltp_data")

from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

paragraph = '中国进出口银行与中国银行加强合作。中国进出口银行与中国银行加强合作!'

sentence = SentenceSplitter.split(paragraph)[0]

# Note: the original passed an absolute path as the second argument of os.path.join,
# which makes MODELDIR irrelevant; the absolute path is kept and used directly here.
segmentor = Segmentor()
segmentor.load("/home/zhangxin/work/LTP/ltp-models/3.3.1/ltp_data/cws.model")
words = segmentor.segment(sentence)
print "\t".join(words)

postagger = Postagger()
postagger.load("/home/zhangxin/work/LTP/ltp-models/3.3.1/ltp_data/pos.model")
postags = postagger.postag(words)
# list-of-string parameter is support in 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model') # 词性标注模型路径,模型名称为`pos.model` cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model') # 分词模型路径,模型名称为`cws.model` def extract_institute(text): words = customized_segmentor.segment(text) # 分词 print '\t'.join(words) postags = postagger.postag(words) # 词性标注 netags = recognizer.recognize(words, postags) # 命名实体识别 return "\t".join(netags) if __name__ == '__main__': customized_segmentor = Segmentor() # 初始化实例 customized_segmentor.load(cws_model_path) # 加载模型 postagger = Postagger() # 初始化实例 postagger.load(pos_model_path) # 加载模型 recognizer = NamedEntityRecognizer() # 初始化实例 recognizer.load(ner_model_path) # 加载模型 print '伊斯兰国', extract_institute('伊斯兰国') print '发改委', extract_institute('发改委') print '中华人民共和国国家发展和改革委员会', extract_institute('中华人民共和国国家发展和改革委员会') """ f = open("zhwiki.json") for line in f: item = json.loads(line.strip()) text = item["text"] print text
def ws_data(self): f = open("pnn_annotated.txt", 'r') total_line = 0 orgin_attr = [0, 0, 0] judge_attr = [0, 0, 0] right = [0, 0, 0] segmentor = Segmentor() segmentor.load("cws.model") for line in f: total_line += 1 # print 'line has been read' value_num = [0, 0] result = line.split('\t') ws_lst = segmentor.segment(result[1]) # print 'this line is %s' % (line) for i in ws_lst: classify = '' try: value = self.setiment_words[i] except: pass else: if value == 1: print 'positive word:%s' % i value_num[0] += 1 elif value == -1: print 'negative word:%s' % i value_num[1] += 1 if value_num[0] == 0 and value_num[1] == 0: classify = 'neutral' judge_attr[0] += 1 elif value_num[0] == value_num[1] != 0: classify = 'neutral' judge_attr[0] += 1 elif value_num[0] > value_num[1]: classify = 'positive' judge_attr[1] += 1 else: classify = 'negative' judge_attr[2] += 1 print value_num print 'classfiy result:%s' % classify # the count of original'emotion if result[0] == '0': orgin_attr[0] += 1 elif result[0] == '1': orgin_attr[1] += 1 else: orgin_attr[2] += 1 if (int(result[0]) == 0 and value_num[0] == 0 and value_num[1] == 0): # print 'neutral' right[0] += 1 elif (int(result[0]) == 0 and value_num[0] == value_num[1] != 0): # print 'neutral' right[0] += 1 elif (int(result[0]) > 0 and value_num[0] >= value_num[1] and value_num[0] != 0): # print 'positive' right[1] += 1 elif (int(result[0]) < 0 and value_num[0] < value_num[1] and value_num[1] != 0): # print 'negative' right[2] += 1 # print 'Accuracy so far: %f\n' % ((right[0] + right[1] + right[2]) / float(total_line)) print 'orgin\'s neutral, positive, negative' print orgin_attr print 'judge_attr neutral, positive, negative' print judge_attr print 'neutral, positive, negative' print right print (right[0] + right[1] + right[2]) print 'total_line %f\n' % total_line print 'Accuracy so far: %f\n' % ((right[0] + right[1] + right[2]) / float(total_line)) segmentor.release()
class LtpParser(): def __init__(self): LTP_DIR = "./ltp_data" self.lac = LAC(mode='lac') self.lac.load_customization('data/custom.txt', sep=None) self.ddparser = DDParser(encoding_model='transformer') self.fine_info = FineGrainedInfo self.keyword = Keyword() self.jieba = jieba self.posseg = jieba.posseg self.segmentor = Segmentor(os.path.join(LTP_DIR, "cws.model")) self.postagger = Postagger( model_path=os.path.join(LTP_DIR, "pos.model")) self.parser = Parser(os.path.join(LTP_DIR, "parser.model")) self.recognizer = NamedEntityRecognizer( os.path.join(LTP_DIR, "ner.model")) '''ltp基本操作''' def basic_parser(self, words): postags = list(self.postagger.postag(words)) netags = self.recognizer.recognize(words, postags) return postags, netags '''ltp获取词性''' def get_postag(self, words): return list(self.postagger.postag(words)) '''基于实体识别结果,整理输出实体列表''' def format_entity(self, words, netags, postags): name_entity_dist = {} name_entity_list = [] place_entity_list = [] organization_entity_list = [] ntag_E_Nh = "" ntag_E_Ni = "" ntag_E_Ns = "" index = 0 for item in zip(words, netags): word = item[0] ntag = item[1] if ntag[0] != "O": if ntag[0] == "S": if ntag[-2:] == "Nh": name_entity_list.append(word + '_%s ' % index) elif ntag[-2:] == "Ni": organization_entity_list.append(word + '_%s ' % index) else: place_entity_list.append(word + '_%s ' % index) elif ntag[0] == "B": if ntag[-2:] == "Nh": ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index elif ntag[-2:] == "Ni": ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index else: ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index elif ntag[0] == "I": if ntag[-2:] == "Nh": ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index elif ntag[-2:] == "Ni": ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index else: ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index else: if ntag[-2:] == "Nh": ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index name_entity_list.append(ntag_E_Nh) ntag_E_Nh = "" elif ntag[-2:] == "Ni": ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index organization_entity_list.append(ntag_E_Ni) ntag_E_Ni = "" else: ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index place_entity_list.append(ntag_E_Ns) ntag_E_Ns = "" index += 1 name_entity_dist['nhs'] = self.modify_entity(name_entity_list, words, postags, 'nh') name_entity_dist['nis'] = self.modify_entity(organization_entity_list, words, postags, 'ni') name_entity_dist['nss'] = self.modify_entity(place_entity_list, words, postags, 'ns') return name_entity_dist '''entity修正,为rebuild_wordspostags做准备''' def modify_entity(self, entity_list, words, postags, tag): entity_modify = [] if entity_list: for entity in entity_list: entity_dict = {} subs = entity.split(' ')[:-1] start_index = subs[0].split('_')[1] end_index = subs[-1].split('_')[1] entity_dict['start_index'] = start_index entity_dict['end_index'] = end_index if start_index == entity_dict['end_index']: consist = [ words[int(start_index)] + '/' + postags[int(start_index)] ] else: consist = [ words[index] + '/' + postags[index] for index in range(int(start_index), int(end_index) + 1) ] entity_dict['consist'] = consist entity_dict['name'] = ''.join( tmp.split('_')[0] for tmp in subs) + '/' + tag entity_modify.append(entity_dict) return entity_modify '''基于命名实体识别,修正words,postags''' def rebuild_wordspostags(self, name_entity_dist, words, postags): pre = ' '.join( [item[0] + '/' + item[1] for item in zip(words, postags)]) post = pre for et, infos in name_entity_dist.items(): if infos: for info in infos: post = post.replace(' '.join(info['consist']), info['name']) post = [ word for word in post.split(' ') if 
len(word.split('/')) == 2 and word.split('/')[0] ] words = [tmp.split('/')[0] for tmp in post] postags = [tmp.split('/')[1] for tmp in post] return words, postags '''依存关系格式化''' def syntax_parser(self, words, postags): arcs = self.parser.parse(words, postags) words = ['Root'] + words postags = ['w'] + postags tuples = list() for index in range(len(words) - 1): # arc_index = arcs[index].head arc_index = arcs[index][0] # arc_relation = arcs[index].relation arc_relation = arcs[index][1] tuples.append([ index + 1, words[index + 1], postags[index + 1], words[arc_index], postags[arc_index], arc_index, arc_relation ]) return tuples '''为句子中的每个词语维护一个保存句法依存儿子节点的字典''' def build_parse_child_dict(self, words, postags, tuples): child_dict_list = list() for index, word in enumerate(words): child_dict = dict() for arc in tuples: if arc[3] == word: if arc[-1] in child_dict: child_dict[arc[-1]].append(arc) else: child_dict[arc[-1]] = [] child_dict[arc[-1]].append(arc) child_dict_list.append([word, postags[index], index, child_dict]) return child_dict_list '''parser主函数''' def parser_main(self, words, postags): tuples = self.syntax_parser(words, postags) child_dict_list = self.build_parse_child_dict(words, postags, tuples) return tuples, child_dict_list '''基础语言分析,ltp标注词性还输出命名实体识别,导致两者不一样,所以rebuild转换''' def basic_process(self, sentence): words = list(self.segmentor.segment(sentence)) postags, netags = self.basic_parser(words) name_entity_dist = self.format_entity(words, netags, postags) words, postags = self.rebuild_wordspostags(name_entity_dist, words, postags) return words, postags
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
__doc__ = 'description'
__author__ = '*****@*****.**'

import os
from pyltp import Segmentor

rootPath = 'C:\\Users\\13314\\Desktop\\Bi-LSTM+CRF\\'
ltpPath = os.path.join(rootPath, 'ltp_data_v3.4.0')

# Word segmenter, loaded together with a user-defined lexicon
SEGMENTOR = Segmentor()
SEGMENTOR.load_with_lexicon(os.path.join(ltpPath, 'cws.model'),
                            os.path.join(ltpPath, 'userDict.txt'))

if __name__ == '__main__':
    pass
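# Hedged usage sketch for the lexicon-aware segmentor loaded above: the function name
# and the sample sentence are illustrative only; SEGMENTOR.release() can be called once
# segmentation is finished.
def demo_segment(sentence='中国进出口银行与中国银行加强合作'):
    words = SEGMENTOR.segment(sentence)
    print('\t'.join(words))
    return list(words)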
class Robot(): def __init__(self, theOptions): self.options = theOptions self.minus_verbs = self.loadMinusVerbs() ###ltp # -*- coding: utf-8 -*- import os LTP_DATA_DIR = '/path/to/your/ltp_data' # ltp模型目录的路径 cws_model_path = './/knols//ltp_data//cws.model' # 分词模型路径,模型名称为`cws.model` pos_model_path = './/knols//ltp_data//pos.model' # 词性标注模型路径,模型名称为`pos.model` ner_model_path = './/knols//ltp_data//ner.model' # 词性标注模型路径,模型名称为`ner.model` from pyltp import Segmentor self.segmentor = Segmentor() # 初始化实例 self.segmentor.load(cws_model_path) # 加载模型 #words = self.segmentor.segment('元芳你怎么看') # 分词 from pyltp import Postagger self.postagger = Postagger() # 初始化实例 self.postagger.load(pos_model_path) # 加载模型 from pyltp import NamedEntityRecognizer self.recognizer = NamedEntityRecognizer() # 初始化实例 self.recognizer.load(ner_model_path) # 加载模型 def tableKG(self, mathKG): #http://www.lxway.com/46509591.htm #https://segmentfault.com/q/1010000008847010 x = PrettyTable([u"执行者", u"角色[所有者|干预者]", u"物品", u"操作符", u"数量"], encoding=sys.stdout.encoding) x.align[u"物品"] = "1" #以姓名字段左对齐 x.padding_width = 1 # 填充宽度 for owner in mathKG['owners']: for wupin in owner['wupin']: x.add_row([owner['name'],owner['role'], wupin['name'],wupin['operator'], wupin['amount']]) print x def getResultInNatLaguage(self, result, tigan, timu, message= None): timu_origin = copy.deepcopy(timu) timu = re.sub(u'(?|\?)', u'。', timu) print tigan.encode('gbk'), timu_origin.encode('gbk') if result: print u'答:', re.sub(u'(多少|几)', str(result), timu) if self.options.debug: print message # if result== 0: # print tgkg['persons'][0]['name'],u'与', tgkg['persons'][1]['name'],u'一样多' # if result >0: # print tgkg['persons'][0]['name'],u'比', tgkg['persons'][1]['name'],u'多',result # if result <0: # print tgkg['persons'][0]['name'],u'比', tgkg['persons'][1]['name'],u'少',result def assignRoleToPlayers(self, tigan_kg, timu_kg): if tigan_kg is None or timu_kg is None: return if len(timu_kg['owners']) ==0: for i in range(len(tigan_kg['owners'])): if i == 0: tigan_kg['owners'][i]['role'] = u'所有者' else: tigan_kg['owners'][i]['role'] = u'干预者' def process(self, question, theOptions, scoringOnly= False): tigan = question['robot']['tigan'] timu = question['robot']['timu'] posTagger = 'jieba' tgkg = None tgkg_a = self.calc(tigan, 'TIGAN',theOptions, scoringOnly, pos_tag_method=posTagger) if len(tgkg_a['owners']) == 0: tgkg_b = self.calc(tigan, 'TIGAN',theOptions, scoringOnly, pos_tag_method='ltp') if len(tgkg_b['owners']) > 0: posTagger = 'ltp' tgkg = copy.deepcopy(tgkg_b) else: tgkg = copy.deepcopy(tgkg_a) tmkg = self.calc(timu, 'TIMU', theOptions, scoringOnly, pos_tag_method=posTagger) self.assignRoleToPlayers(tgkg, tmkg) self.updateKGOperators(tgkg, tigan) result = None if tgkg is None: return result msg = {} #result = int(tgkg['owners'][0][attributs[0]]['amount'] - tgkg['owners'][1][attributs[0]]['amount']) msg['ower_size'] = len(tgkg['owners']) result_ex = '0' if msg['ower_size'] > 0: for number in tgkg['owners'][0]['wupin']: result_ex += number['operator'] + str(number['amount']) result = eval(result_ex) if not scoringOnly: print result_ex self.getResultInNatLaguage(result, tigan, timu, message = msg) self.tableKG(tgkg) return result def getMyWords(self, timu, toDoDebug, keepSilent, pos_tag_method='jieba'): ''' pos_tag_method: jieba | ltp ''' if pos_tag_method=='jieba': words = pseg.cut(timu) myWords = [] for w in words: myWord = NewWord() ###分词矫正 if w.word in [u'爸爸',u'妈妈']: w.flag = 'nr' myWord.word = w.word myWord.flag = w.flag myWords.append(myWord) toBeRemoved = [] for 
index in range(len(myWords)): if myWords[index].flag == 'ns': if index -1 >=0: if myWords[index-1].flag == 'n': myWords[index].word = myWords[index-1].word + myWords[index].word toBeRemoved.append(index-1) newMyWords = [] for index in range(len(myWords)): if index in toBeRemoved: continue else: newMyWords.append(myWords[index]) return newMyWords, [] ###ltp # -*- coding: utf-8 -*- debug_msgs = [] if pos_tag_method == 'ltp': words = self.segmentor.segment(timu.encode('utf8')) postags = self.postagger.postag(words) debug_msgs.append('\t'.join(postags)) myWords = [] for i in range(len(words)): w = words[i] myWord = NewWord() myWord.word = w.decode('utf8') myWord.flag = postags[i] myWords.append(myWord) #print w.decode('utf8').encode('gbk') debug_msgs.append(w.decode('utf8')) #netags = self.recognizer.recognize(words, postags) # 命名实体识别 return myWords, debug_msgs def loadMinusVerbs(self): math_kg_file = './/knols//mathKG.json' sContent = ''.join(open(math_kg_file, 'r').readlines()) sContent = sContent.decode('utf8') return sContent.split('|') def getOperator(self, number, startPosition, timu, theKG): numberPos = timu.find(number, startPosition) endPos = numberPos + len(number) theText = timu[startPosition:endPos] #minus_verb_str = u'用了|用去了|买了|飞走|送给|拔|败|剥|爆|吃了|完成|拿下来|分给|没来|矮|坳|扒|拔|罢|败|病|擦|裁|残|差|拆|扯|撤|沉|惩|迟|抽|掉|黜|辞|倒|丢|夺|剁|废|分|负|过去|割|刮|化|剪|借|砍|看了|亏|离|漏|掠|抹|免|排|磨|抹|赔|劈|骗|弃|迁|抢|切|取|扫|杀|删|少|失|剩|淘|剔|偷|退|忘|违|误|削|消|逊|湮灭|掩饰|游走|凿|折|遮|坠|啄|走' #minus_verbs = minus_verb_str.split('|') rlt = re.search(u'({theMinusPattern})\w*'.format(theMinusPattern = '|'.join(self.minus_verbs)) + number, theText) if rlt: raw_input(rlt.group(0)) ####disambiguate if len(theKG['owners']) == 1: if rlt.group().find(u'买了')>=0: return endPos + 1, '+' return endPos + 1, '-' else: newText = timu[numberPos:] #raw_input(newText.encode('gbk')) rlt2 = re.search(number + u'\w*({theMinusPattern})'.format(theMinusPattern = '|'.join(self.minus_verbs)), newText) if rlt2: return numberPos + len(rlt2.group(0)), '-' else: return endPos + 1, '+' def getEntities(self, words, posTaggingMethod): entities = {} entities['owner'] = [] entities['wupin'] = [] entities['numbers'] = [] for w in words: print w.word, w.flag if posTaggingMethod == 'jieba': for w in words: #nr 人名#s 处所词; #f 方位词#s 处所词#ns 地名 #r 代词 if w.flag in ['nr', 's', 'f', 'ns', 'r']: if not w.word in entities['owner']: entities['owner'].append(w.word) if w.flag in ['m']: entities['numbers'] if re.search('\d', w.word): entities['numbers'].append( w.word) if w.flag in ['n', 'nr']: if not w.word in entities['wupin']: entities['wupin'].append(w.word) if posTaggingMethod == 'ltp': posTagList = [] for w in words: posTagList.append(w.flag) for i in range(len(words)): w = words[i] if w.flag in ['nh']: if not w.word in entities['owner']: entities['owner'].append(w.word) '''爸爸 养 了 6 条 红色 金鱼 n v u m q n n ''' if i == 0: if words[i+1].flag == 'v': entities['owner'].append(w.word) if w.flag in ['m']: if i == 0: if words[i+1] == 'q': '''一 年 12 个 月''' entities['owner'].append(w.word + words[i+1].word) continue entities['numbers'] if re.search('\d', w.word): entities['numbers'].append( w.word) if w.flag in ['n', 'nr']: if i>=2: is_qualified = False if words[i-2].flag == 'm' and words[i-1].flag == 'q': is_qualified = True if words[i-1].flag == 'm' : is_qualified = True if is_qualified: if not w.word in entities['wupin']: entities['wupin'].append(w.word) ''' 十一月份 总共 30 天 ''' if w.flag in ['nt']: entities['owner'].append(w.word) ''' 一 年 12 个 月 , 过去 了 10 个 月 m q m q n ''' if ' '.join(posTagList).find('m q m q') 
== 0: entities['owner'].append(words[0].word + words[1].word) ####{'owner': [], 'wupin': [u'\u4eba', u'\u620f'], 'numbers': [u'22', u'13']} return entities def updateKGOperators(self, math_kg, tigan_text): if math_kg is None: return for m in range(len(math_kg['owners'])): oneOwner = math_kg['owners'][m] for n in range(len(oneOwner['wupin'])): oneWupin = oneOwner['wupin'][n] int_pos = int(oneWupin['number_position']) number = oneWupin['amount'] pos, theOperator = self.getOperator(str(number), int_pos, tigan_text, math_kg) #if oneOwner['role'] == math_kg['owners'][m]['wupin'][n]['operator'] = theOperator def disabiguateOwners(self, owner_names): for prn in [u'他', u'她', u'它', u'他们', u'她们', u'它们']: if prn in owner_names: pos = owner_names.index(prn) if pos >0: owner_names.remove(prn) def isRealOwnerWupinNumberTriple(self, owner, wupin_name, number, tigan): sents = tigan.split(u',') for sent in sents: import chardet #if owner['name'] in sent and str(number) in sent: #bug: 小刚 3 小刚存了43元 #因为3不属于小刚,但是刚好43字母中含有3,所以误判。 #因此,上面的if 语句不是一个号的判断。 #应该将数字都提取出来,然后做判断 regex=re.compile('\d+') allNumbersInSent = regex.findall(sent) if owner['name'] in sent and str(number) in allNumbersInSent: print owner['name'], number, sent raw_input() tmpName = u'比' + owner['name'] if tmpName in sent: return False return True return False def calc(self, word_math_problem, section, theOptions, scoringOnly, pos_tag_method='jieba'): ''' pos_tag_method: part of speech tagging method ''' kg = {} words, debugMsgList = self.getMyWords(word_math_problem, theOptions.debug, scoringOnly, pos_tag_method= pos_tag_method) entities = self.getEntities(words, pos_tag_method) self.disabiguateOwners(entities['owner']) #https://wenku.baidu.com/view/03abd6f70508763231121250.html kg['owners'] = [] #owner could be person or 处所词或方位词 wupin_name = None #草地上3只小鸡在做游戏 #小鸡 游戏都是wupin,取其一。 ####去除与实施者相同的物品 for ownerName in entities['owner']: if ownerName in entities['wupin']: entities['wupin'].remove(ownerName) if len(entities['wupin']) > 0: wupin_name = entities['wupin'][0] for ownerName in entities['owner']: owner = {'name':ownerName, 'role':None} if section == 'TIGAN': owner['wupin'] = [] pos = 0 for number in entities['numbers']: if self.isRealOwnerWupinNumberTriple(owner, wupin_name, number, word_math_problem): #pos, theOperator = self.getOperator(str(number), pos, timu) owner['wupin'].append({'amount':str(number), 'number_position': pos, 'name':wupin_name, 'operator': None}) kg['owners'].append(owner) return kg
# -*- coding: utf-8 -*-
import os

LTP_DATA_DIR = '/home/lgx/Desktop/pyltp/ltp_data_v3.4.0'   # path to the ltp model directory
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')   # segmentation model, file name `cws.model`
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')

from pyltp import Segmentor
from pyltp import Postagger

str_1 = open('default').read()

segmentor = Segmentor()          # initialize the instance
postagger = Postagger()
segmentor.load(cws_model_path)   # load the models
postagger.load(pos_model_path)

words = segmentor.segment(str_1)  # segmentation
postags = postagger.postag(words)

f = open('result_1', "w")
for i in range(0, len(words)):
    # print words[i]
    # print postags[i]
    # print '|'
    print >> f, '%s' % words[i]

segmentor.release()  # release the models
postagger.release()
# -*- coding: utf-8 -*-
# @Time    : 2019/11/7 14:04
# @Author  : tian
# @Email   : [email protected]
# @File    : test2.py
# @Software: PyCharm

LTP_DATA_DIR = 'D:\BAK\LTP_3.4\ltp_data'  # path to the ltp model directory

# segmentation
import os
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # segmentation model, file name `cws.model`
from pyltp import Segmentor
segmentor = Segmentor()          # initialize the instance
segmentor.load(cws_model_path)   # load the model

# POS tagging
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model, file name `pos.model`
from pyltp import Postagger
postagger = Postagger()          # initialize the instance
postagger.load(pos_model_path)   # load the model

# named entity recognition
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # NER model, file name `ner.model`
from pyltp import NamedEntityRecognizer
recognizer = NamedEntityRecognizer()  # initialize the instance
recognizer.load(ner_model_path)       # load the model

Data_path = "D:\代码库\-\文本分类器\分类结果\轻量敏感文本的分类结果.txt"
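# Hedged sketch (not in the original snippet): read the classified text file at
# Data_path and run the segment -> postag -> NER pipeline loaded above. The assumption
# that the file holds one sentence per line is mine, not the original author's.
with open(Data_path, 'r', encoding='utf-8') as fin:
    for line in fin:
        line = line.strip()
        if not line:
            continue
        words = list(segmentor.segment(line))
        postags = list(postagger.postag(words))
        netags = list(recognizer.recognize(words, postags))
        print('\t'.join('%s/%s/%s' % item for item in zip(words, postags, netags)))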
import os
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer

# LTP model directory (an unresolved merge left two variants here: a per-user absolute
# path under C:\Users\yang\... and this relative path; the relative one is kept)
LTP_DATA_DIR = ".\ltp_data_v3\ltp_data_v3.4.0"

cws_model_path = os.path.join(LTP_DATA_DIR, "cws.model")     # segmentation model path
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')     # POS tagging model path
pas_model_path = os.path.join(LTP_DATA_DIR, "parser.model")  # dependency parsing model path
# srl_model_path = os.path.join(LTP_DATA_DIR, 'srl')  # semantic role labelling model directory `srl`; note it is a directory, not a file
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')     # named entity recognition model, file name `ner.model`

# segmenter instance
segmentor = Segmentor()
segmentor.load(cws_model_path)
# POS tagger
postagger = Postagger()
postagger.load(pos_model_path)
# dependency parser
parser = Parser()
parser.load(pas_model_path)
# named entity recognizer
recognizer = NamedEntityRecognizer()  # initialize the instance
recognizer.load(ner_model_path)       # load the model
'''
# semantic role labeller
labeller = SementicRoleLabeller()  # initialize the instance
labeller.load(srl_model_path)      # load the model
'''
p_name = model.predict(vec) print p_name if __name__ == "__main__" : argp = argparse.ArgumentParser(description="Online Classification System") argp.add_argument("-c" , "--classname" , choices=[classname.JUNK , classname.SENSITIVE] , required=True , help="Classification Type , junk-class or sensitive class ") argp.add_argument("-s" , "--sample_interval_mode" , choices=[sampleIntervalMode.LINE_MODE , sampleIntervalMode.CRLF_MODE] , default=sampleIntervalMode.LINE_MODE , help="The mode with describes what is the inverval symbol between samples , default is LINE_MODE") argp.add_argument("-i" , "--input" , type=str , default="stdin" , help="'sysin' for using standard input ; else file path is needed.") args = argp.parse_args() logging.info("loadding segmentor") segmentor = Segmentor() segmentor.load(CWS_MODEL_PATH) logging.info("done") # loading model if args.classname == classname.JUNK : model = TFIDFModel() model.load_model(JUNK_MODEL_PATH) else : model = BOOLModel() model.load_model(SENSITIVE_MODEL_PATH) #process the input file if args.input == "stdin" : ifo = sys.stdin else :
class LtpParser: def __init__(self): LTP_DIR = "data\ltp_data" cws_model_path = os.path.join(LTP_DIR, 'cws.model') # 分词模型路径,模型名称为`cws.model` lexicon_path = "dictionary\Dir1.txt" # 参数lexicon是自定义词典的文件路径 self.segmentor = Segmentor() self.segmentor.load_with_lexicon(cws_model_path, lexicon_path) # self.segmentor.load(os.path.join(LTP_DIR, "cws.model")) self.postagger = Postagger() self.postagger.load(os.path.join(LTP_DIR, "pos.model")) self.parser = Parser() self.parser.load(os.path.join(LTP_DIR, "parser.model")) self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(LTP_DIR, "ner.model")) self.labeller = SementicRoleLabeller() self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model')) '''语义角色标注''' def format_labelrole(self, words, postags): arcs = self.parser.parse(words, postags) roles = self.labeller.label(words, postags, arcs) roles_dict = {} for role in roles: roles_dict[role.index] = {arg.name:[arg.name,arg.range.start, arg.range.end] for arg in role.arguments} return roles_dict '''句法分析---为句子中的每个词语维护一个保存句法依存儿子节点的字典''' def build_parse_child_dict(self, words, postags, arcs): child_dict_list = [] format_parse_list = [] for index in range(len(words)): child_dict = dict() for arc_index in range(len(arcs)): if arcs[arc_index].head == index+1: #arcs的索引从1开始 if arcs[arc_index].relation in child_dict: child_dict[arcs[arc_index].relation].append(arc_index) else: child_dict[arcs[arc_index].relation] = [] child_dict[arcs[arc_index].relation].append(arc_index) child_dict_list.append(child_dict) rely_id = [arc.head for arc in arcs] # 提取依存父节点id relation = [arc.relation for arc in arcs] # 提取依存关系 heads = ['Root' if id == 0 else words[id - 1] for id in rely_id] # 匹配依存父节点词语 for i in range(len(words)): # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n'] a = [relation[i], words[i], i, postags[i], heads[i], rely_id[i]-1, postags[rely_id[i]-1]] format_parse_list.append(a) return child_dict_list, format_parse_list '''parser主函数''' def parser_main(self, sentence): words = list(self.segmentor.segment(sentence)) postags = list(self.postagger.postag(words)) arcs = self.parser.parse(words, postags) child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs) roles_dict = self.format_labelrole(words, postags) return words, postags, child_dict_list, roles_dict, format_parse_list
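# Hedged usage sketch for LtpParser above: instantiation loads every model from
# data\ltp_data plus the lexicon in dictionary\Dir1.txt, so those paths must exist;
# the sample sentence is a placeholder matching the comment inside the class.
if __name__ == '__main__':
    ltp = LtpParser()
    words, postags, child_dict_list, roles_dict, format_parse_list = ltp.parser_main('李克强总理今天来我家了。')
    print(words)
    print(postags)
    print(format_parse_list)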
def seg_initialize(model_path, lexicon_path):
    print "load segment data..."
    segmentor = Segmentor()
    segmentor.load_with_lexicon(model_path, lexicon_path)
    return segmentor
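# Hedged usage sketch for seg_initialize above: the model and lexicon paths are
# placeholders; release() frees the model once segmentation is done.
if __name__ == "__main__":
    seg = seg_initialize("ltp_data/cws.model", "lexicon.txt")
    print "\t".join(seg.segment("中国进出口银行与中国银行加强合作"))
    seg.release()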
def load_source(maindir, word_dict): n_processed = 0 contents_dict = {} segmentor = Segmentor() segmentor.load('/home/caory/github/table-detection/data/table-v5/ltp_data/cws.model') dirlist = os.listdir(maindir) for docid in dirlist: n_processed += 1 print('Load Source: doc: %s, rate: %.2f%%' % ( docid, 100.0 * n_processed / len(dirlist))) sys.stdout.flush() contents_dict[docid] = {} json_path = os.path.join(maindir, docid, 'pages_with_tables') if not os.path.exists(json_path): continue data = read_json(json_path) for pageid in data: contents_dict[docid][pageid] = {} size = data[pageid]['size'] texts, curves, others, tables = [], [], [], [] # 获取表格框 pad, offset = 2, 5 for box in data[pageid]['tables']: left = max(offset, int(math.floor(float(box[0])) - pad)) right = min(int(math.ceil(float(box[2])) + pad), size[0]-offset) top = max(offset, int(math.floor(float(size[1]-box[3])) - pad)) bottom = min(int(math.ceil(float(size[1]-box[1])) + pad), size[1]-offset) if 0 <= left <= right < size[0] and 0 <= top <= bottom < size[1]: tables.append({'position': [left, right, top, bottom]}) # 获取文本框 for text in data[pageid]['texts']: # 获取每一个字符的位置 chars = [] for char in text['chars']: left = int(math.floor(float(char['box'][0]))) right = int(math.floor(float(char['box'][2]))) top = int(math.floor(float(size[1]-char['box'][3]))) bottom = int(math.floor(float(size[1]-char['box'][1]))) if 0 <= left <= right < size[0] and 0 <= top <= bottom < size[1]: chars.append({'position': [left, right, top, bottom], 'sentence': char['text'].strip()}) # 对于距离近的字符进行合并 for char in chars: merged = False for i in range(len(texts)): box = texts[i] if char['position'][2] == texts[i]['position'][2] and \ char['position'][3] == texts[i]['position'][3] and \ text['type'] == texts[i]['type']: if abs(char['position'][0] - texts[i]['position'][1]) <= 5: texts[i]['position'][1] = char['position'][1] merged = True break elif abs(char['position'][1] - texts[i]['position'][0]) <= 5: texts[i]['position'][0] = char['position'][0] merged = True break if not merged: texts.append({'position': char['position'], 'type': text['type'], 'sentence': text['text'].strip()}) # 对于页码进行特殊识别 for i in range(len(texts)): top = texts[i]['position'][2] bottom = texts[i]['position'][3] if 1.0 * top / size[1] <= 0.85: continue is_page = True for j in range(len(texts)): if j == i: continue other_top = texts[j]['position'][2] other_bottom = texts[j]['position'][3] if other_bottom >= top: is_page = False break if is_page: texts[i]['type'] = 5 # 将下划线文本框改为表格框 new_texts = [] for text in texts: isline = True if 'sentence' in text and text['type'] == 2: for s in text['sentence']: if s != '_': isline = False if isline and len(text['sentence']) >= 3: pos = [text['position'][0], text['position'][1], text['position'][3]-1, text['position'][3]] curves.append({'position': pos, 'type': 1}) else: new_texts.append(text) else: new_texts.append(text) texts = new_texts # 获取其他框(图片等) for other in data[pageid]['others']: left = int(math.floor(float(other['box'][0]))) right = int(math.floor(float(other['box'][2]))) top = int(math.floor(float(size[1]-other['box'][3]))) bottom = int(math.floor(float(size[1]-other['box'][1]))) if 0 <= left <= right < size[0] and 0 <= top <= bottom < size[1]: others.append({'position': [left, right, top, bottom], 'type': other['type']}) # 获取每一个线条的位置 curves = [] curve_width = 2 for curve in data[pageid]['curves']: left = int(math.floor(float(curve['box'][0]))) right = int(math.floor(float(curve['box'][2]))) top = 
int(math.floor(float(size[1]-curve['box'][3]))) bottom = int(math.floor(float(size[1]-curve['box'][1]))) if right - left <= curve_width and bottom - top > curve_width: right = left line = {'position': [left, right, top, bottom], 'type': curve['type']} elif right - left > curve_width and bottom - top <= curve_width: bottom = top line = {'position': [left, right, top, bottom], 'type': curve['type']} if line: if 0 <= line['position'][0] <= line['position'][1] < size[0] and \ 0 <= line['position'][2] <= line['position'][3] < size[1]: curves.append(line) words = [] for text in texts: if text['type'] == 2: ws = segmentor.segment(text['sentence'].encode('utf8')) uint = 1.0 * (text['position'][1] - text['position'][0]) / len(text['sentence']) \ if len(text['sentence']) != 0 else 0 ws = [w.decode('utf8') for w in ws] n_passed = 0 for w in ws: left = text['position'][0] + int(n_passed * uint) right = text['position'][0] + int((n_passed+len(w)) * uint) top = text['position'][2] bottom = text['position'][3] n_passed += len(w) if w in word_dict: color = [int(255 * t) for t in word_dict[w]] else: color = [150, 150, 150] words.append({'text': w, 'position': [left, right, top, bottom], 'color': color, 'type': 2}) contents_dict[docid][pageid] = { 'words': words, 'texts': texts, 'size': size, 'tables': tables, 'others': others, 'curves': curves} return contents_dict
# -*- coding:utf-8 -*-
# segment
from pyltp import Segmentor

segmentor = Segmentor()
segmentor.load('./ltp-model/cws.model')


def segment(text):
    return segmentor.segment(text)
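# Hedged usage sketch for the segment() helper above: the sample text is a placeholder;
# segmentor.release() can be called when the module is done with the model.
if __name__ == '__main__':
    print('/'.join(segment('中国进出口银行与中国银行加强合作')))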
def word_vec_case_set(cls, word_model_file, with_name=False, merge_by='mosaic'): """ 获取词向量特征集,认为词条最多10个词 如果以mosaic方式,每个词条被表示为50*10=500维 如果以sum方式,每个词条被表示为50维 :param word_model_file: 词向量模型文件 :param with_name: 正样例是否包含人名 :param merge_by: 词条中词项量的结合方式,mosaic或sum :return: 一个字典{pos_case:{正例},neg:{负例}} """ segmentor = Segmentor() segmentor.load("../word2vec_process/model/cws.model") word_vec_model = word2vec.Word2Vec.load('../word2vec_process/model/' + word_model_file) case_dict = cls.load_case_set(with_name) word_vec_case_dict = {} if merge_by == 'mosaic': # 以词向量拼接的方式构建词条表示,500维 pos_case_list = case_dict['pos_case'] pos_case_vec_dict = {} for pos_case in pos_case_list: case_words = segmentor.segment(pos_case) case_vec = [] is_useful = 0 for word in case_words: try: # 拼接 case_vec.extend(word_vec_model[unicode(word)].tolist()) is_useful = 1 except Exception, e: with open("./data/not_in_vocabulary.txt", 'a') as out_file: # 记录缺失词汇 out_file.write(word + '\n') # 多退少补 if len(case_vec) > 500: case_vec = case_vec[0:500] else: while (len(case_vec) < 500): case_vec.append(0) if is_useful: pos_case_vec_dict[pos_case] = case_vec # 负样本 neg_case_list = case_dict['neg'] neg_case_vec_dict = {} for neg_case in neg_case_list: case_words = segmentor.segment(neg_case) case_vec = [] is_useful = 0 for word in case_words: try: # 拼接 case_vec.extend(word_vec_model[unicode(word)].tolist()) is_useful = 1 except Exception, e: with open("./data/not_in_vocabulary.txt", 'a') as out_file: # 记录缺失词汇 out_file.write(word + '\n') # 多退少补 if len(case_vec) > 500: case_vec = case_vec[0:500] else: while (len(case_vec) < 500): case_vec.append(0) if is_useful: neg_case_vec_dict[neg_case] = case_vec
def __init__(self): self.intents = [ 'translation', 'app', 'calc', 'match', 'radio', 'health', 'novel', 'video', 'cinemas', 'music', 'stock', 'train', 'news', 'message', 'map', 'weather', 'cookbook', 'tvchannel', 'flight', 'schedule', 'riddle', 'email', 'contacts', 'bus', 'website', 'datetime', 'poetry', 'lottery', 'chat', 'epg', 'telephone' ] self.segmentor = Segmentor() # 初始化实例 CWS self.segmentor.load(configs.cws_path) # 加载模型 self.postagger = Postagger() # 初始化实例 POS Tagger self.postagger.load(configs.pos_path) # 加载模型 self.labeller = SementicRoleLabeller() # 初始化实例 SRLer self.labeller.load(configs.srl_path) # 加载模型 self.parser = Parser() # 初始化实例 Parser self.parser.load(configs.parser_path) # 加载模型 self.ac = ACAutomatons() self.clf_31 = NBSVM() self.char_vectorizer_31 = joblib.load(configs.models_path + '/nbsvm-vocab-ch.pkl') self.word_vectorizer_31 = joblib.load(configs.models_path + '/nbsvm-vocab-wd.pkl') self.clf_31 = joblib.load(configs.models_path + '/nbsvm_31.pkl') self.ch2_ = joblib.load(configs.models_path + '/nbsvm-feature_selector.pkl') self.word_vectorizer_tv = joblib.load(configs.models_path + '/vocab-wd_epg-tvchannel.pkl') self.char_vectorizer_tv = joblib.load(configs.models_path + '/vocab-ch_epg-tvchannel.pkl') self.clf_tv = joblib.load(configs.models_path + '/svm_epg-tvchannel.pkl') self.word_vectorizer_movie = joblib.load(configs.models_path + '/vocab-wd_video-cinemas.pkl') self.char_vectorizer_movie = joblib.load(configs.models_path + '/vocab-ch_video-cinemas.pkl') self.clf_movie = joblib.load(configs.models_path + '/svm_video-cinemas.pkl') self.char_vectorizer_internet = joblib.load( configs.models_path + '/vocab-ch_website-app.pkl') self.word_vectorizer_internet = joblib.load( configs.models_path + '/vocab-wd_website-app.pkl') self.clf_internet = joblib.load(configs.models_path + '/svm_website-app.pkl') self.char_vectorizer_star = joblib.load(configs.models_path + '/vocab-ch_video-music.pkl') self.clf_star = joblib.load(configs.models_path + '/svm_video-music.pkl') self.word_vectorizer_star = joblib.load(configs.models_path + '/vocab-wd_video-music.pkl') self.char_vectorizer_video = joblib.load(configs.models_path + '/vocab-ch_video-epg.pkl') self.word_vectorizer_video = joblib.load(configs.models_path + '/vocab-wd_video-epg.pkl') self.clf_video = joblib.load(configs.models_path + '/svm_video-epg.pkl')
# -*- coding: utf-8 -*-
import sys, os

ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
sys.path = [os.path.join(ROOTDIR, "lib")] + sys.path

# Set your own model path
MODELDIR = os.path.join(ROOTDIR, "ltp_data")

from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

paragraph = '平素体质:健康状况:良,既往有“高血压病史”多年'

sentence = SentenceSplitter.split(paragraph)[0]

segmentor = Segmentor()
segmentor.load("/home/yhli/ltp_data/ltp_data_v3.4.0/cws.model")
words = segmentor.segment(sentence)
print("\t".join(words))

postagger = Postagger()
postagger.load("/home/yhli/ltp_data/ltp_data_v3.4.0/pos.model")
postags = postagger.postag(words)
# list-of-string parameter is support in 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print("\t".join(postags))

parser = Parser()
parser.load("/home/yhli/ltp_data/ltp_data_v3.4.0/parser.model")
arcs = parser.parse(words, postags)
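# Hedged continuation sketch (not part of the original file): print the dependency arcs
# produced above and release the loaded models.
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
segmentor.release()
postagger.release()
parser.release()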
# -*- coding: utf-8 -*-
'''
Created on 2018-11-17
@author: Zhukun Luo
Jiangxi university of finance and economics
'''
import re
import os
import codecs
import threading
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
from pyltp import SentenceSplitter

LTP_DIR = 'D:\LTP\MODEL\ltp_data'  # path to the ltp model directory
segmentor = Segmentor()
segmentor.load(os.path.join(LTP_DIR, "cws.model"))  # segmentation model, file name `cws.model`
postagger = Postagger()
postagger.load(os.path.join(LTP_DIR, "pos.model"))  # POS tagging model, file name `pos.model`


class CausalityExractor():
    def __init__(self):
        pass

    '''1. effect-to-cause paired pattern'''
    def ruler1(self, sentence):
        '''
        conm2:〈[之]所以,因为〉、〈[之]所以,由于〉、<[之]所以,缘于〉
        [之]所以,因,[之]所以,归因于、[之]所以,由于、[之]所以,鉴于、[之]所以,由、
        [之]所以,出于、[之]所以,是因为
        conm2_model:<Conj>{Effect},<Conj>{Cause}
import os
import math

from pyltp import Segmentor
import data.TextRank


# sigmoid mapping function
def sigmoid(x):
    x = float(x)
    y = 1 / (1 + math.exp(-x))
    return y


# load and configure pyltp
LTP_DATA_DIR = 'D:\Souhu\ltp_data_v3.4.0\ltp_data_v3.4.0'
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
segmentor = Segmentor()
# segmentor.load(cws_model_path)
segmentor.load_with_lexicon(cws_model_path, 'ExDictForCut_without_JiebaDefault.txt')

# weights used when combining the scores
rank = {'tfidf': 0.4, 'tr': 0.25, 'it': 0.2, 'pos': 0.15}

# load the user dictionary
# jieba.load_userdict("ReducedDict_test3.txt")

# noun-like POS tags that receive a bonus, and characters to discard
sp = set(['an', 'Ng', 'n', 'nr', 'ns', 'nt', 'nz', 'vn', 'j'])
kickout = set(['.', '。', ',', '-', '+', ':', ':', '“', '/', '*', '—', '…'])
# -*- coding:utf-8 -*-
from pyltp import SentenceSplitter
from pyltp import Segmentor
import csvTools
import re
from string import digits
from zhon.hanzi import punctuation
from tqdm import tqdm
import os

segmentor = Segmentor()
segmentor.load_with_lexicon('./ltp/cws.model', 'lexicon.txt')

contexts2018 = csvTools.readCSV('2018all.csv')
contexts2017 = csvTools.readCSV('2017.csv')
contexts2016 = csvTools.readCSV('2016.csv')
contexts = contexts2016 + contexts2017 + contexts2018
print(len(contexts))

# birthlist = []
descriptionlist = []
dignosislist = []
for context in contexts:
    # birth = context[3]
    description = context[6]
    dignosis = context[7]
    # birthlist.append(birth)
    descriptionlist.append(description)
    dignosislist.append(dignosis)
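# Hedged sketch (not in the original snippet): segment each description column with the
# lexicon-aware segmentor loaded above; the result list name is illustrative.
segmented_descriptions = []
for description in tqdm(descriptionlist):
    words = segmentor.segment(description)
    segmented_descriptions.append(' '.join(words))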
def split_sentence(self, sentence=None, say_word_list: List[str] = None, cycle: bool = True, ratio: float = None) -> None: """ 分词 :type say_word_list: :param sentence: :return: """ LTP_DATA_PATH = 'D:\pyltp-master\ltp_data_v3.4.0' cws_model_path = os.path.join(LTP_DATA_PATH, 'cws.model') pos_model_path = os.path.join(LTP_DATA_PATH, 'pos.model') ner_model_path = os.path.join(LTP_DATA_PATH, 'ner.model') par_model_path = os.path.join(LTP_DATA_PATH, 'parser.model') postagger = Postagger() postagger.load(pos_model_path) print('Postagger loaded!') parser = Parser() parser.load(par_model_path) print('Parser loaded!') segment = Segmentor() segment.load(cws_model_path) print('CWS loaded!') if cycle == True: try: lines = sentence sentence = list(segment.segment(lines)) # print('sen ok') # 找出相似 find_say_word = [ word for word in sentence if word in say_word_list ] if len(find_say_word) == 0: print('没有发现类似“说”的单词!') else: post_word = postagger.postag(sentence) post_word = list(post_word) # print('post ok') parse_word = parser.parse(sentence, post_word) parse_word = [(arc.head, arc.relation) for arc in parse_word] # print('parse ok') counter_index = 0 for index, word in enumerate(parse_word): location_part1 = '' location_part2 = '' location_part3 = '' # 找出第一个SBV下的"真新闻" if word[-1] == 'SBV': counter_index = word[0] location_part1 += sentence[index] location_part1 += sentence[word[0] - 1] break # 先将整个SBV后面碰到是双引号或者没有双引号的句子,用于后面文本向量的模型计算 # 暂时只提取双引号内容和两个句号结束的句子为数据 if sentence[counter_index] == '"': for index_2, word_2 in enumerate( sentence[counter_index + 1:]): if word_2 == '"': break location_part2 += word_2 else: for index_2, word_2 in enumerate( sentence[counter_index:]): if word_2 == '。': for word_4 in sentence[index_2 + 1:]: if word_4 == '。': break location_part3 += word_4 break location_part2 += word_2 # 判别说前后两个句号句子的相似度 cal_ratio = difflib.SequenceMatcher( None, location_part2, location_part3).ratio() if cal_ratio > ratio: result = location_part1 + location_part2 + location_part3 else: result = location_part1 + location_part2 segment.release() postagger.release() parser.release() return result.strip('\n') except Exception as e: print(e) elif cycle == False: print('不处理') else: raise TypeError('错误的输入类型') print('词标注和上下文定义结束') print('-' * 20, '华丽的分割线', '-' * 20)
else: word_freq_dict[word] = word_freq_dict[word] + 1 word_freq_dict = sorted(word_freq_dict.items(), key=lambda item: item[1], reverse=True) # print(word_pos_freq_dict) return word_freq_dict if __name__ == "__main__": """ 加载LTP的分词器和词性标注器 """ default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\' # LTP模型文件目录 segmentor = Segmentor() user_dict = "source\\user.txt" segmentor_flag = segmentor.load_with_lexicon( os.path.join(default_model_dir, 'cws.model'), user_dict) postagger = Postagger() postag_flag = postagger.load(os.path.join(default_model_dir, 'pos.model')) path = r"D:\python-file\北京市旅游知识图谱\\verb-entity\\bj_travel" f = open('entity_verb_result\\' + "all_entity.json", 'r', encoding='utf-8') file = f.read() all_entity = json.loads(file)['all_entity'] f.close() file_list = os.listdir(path) all_documents = "" for file_name in file_list:
#!/usr/bin/python
# -*- coding:utf-8 -*-
import re, json, os
import random
import collections
from pyltp import Segmentor

LTP_DIR = "/home/zxsong/workspace/ltp_data_v3.4.0"
segmentor = Segmentor()
segmentor.load(os.path.join(LTP_DIR, "cws.model"))


def split_sentences(article):
    '''
    Split an article into sentences.
    :param article: str
    :return: list(str)
    '''
    article = article.strip()
    para = re.sub('([。!!??\?])([^”’])', r"\1\n\2", article)  # single-character sentence delimiters
    para = re.sub('(\.{6})([^”’])', r"\1\n\2", para)  # English ellipsis
    para = re.sub('(\…{2})([^”’])', r"\1\n\2", para)  # Chinese ellipsis
    para = re.sub('([。!!??\?][”’])([^,。!!??\?])', r'\1\n\2', para)
    para = para.rstrip()
    return para.split("\n")


def cut_sentence(sentence, cut_level="char"):
    '''
    Tokenize a sentence; the default is character-level tokenization.
    :param sentence: str
    :return: list(str)
import os

LTP_DATA_DIR = '/Users/zhangshiwei/ltp_data_v3.4.0/'  # path to the LTP model directory
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # segmentation model, file name `cws.model`
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model, file name `pos.model`
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # named entity recognition model, file name `ner.model`

from pyltp import Segmentor
from pyltp import Postagger
from pyltp import NamedEntityRecognizer

with open("预处理后数据.txt", "r", encoding='utf-8') as f1:
    words = f1.read()

segmentor = Segmentor()  # initialize the instance
segmentor.load_with_lexicon(cws_model_path, 'dict')
seg_list = segmentor.segment(words)  # word segmentation

postagger = Postagger()  # initialize the instance
postagger.load(pos_model_path)  # load the model
postags = postagger.postag(seg_list)  # POS tagging

f2 = open("standrad_data.txt", "a", encoding='utf-8')
for word, postag in zip(seg_list, postags):
    f2.write(word + " " + postag + "\n")
f2.close()

segmentor.release()  # release the model
postagger.release()  # release the model
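# ner_model_path is defined above but never used; a hedged sketch of the matching
# NER step (not in the original file). In practice this would run before the
# release() calls above, while seg_list and postags are still valid.
recognizer = NamedEntityRecognizer()
recognizer.load(ner_model_path)
netags = recognizer.recognize(list(seg_list), list(postags))
recognizer.release()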
# coding:utf-8
import math
import copy
from read_baselabel import read_baselabel
from read_word2vec import read_word2vec
import sys, os
from operator import itemgetter
from pyltp import Segmentor
from collections import Counter

segmentor = Segmentor()
segmentor.load("/data0/shenyanjun/ltp_data/cws.model")

path = os.path.abspath(os.path.dirname(sys.argv[0]))
path_rule_for_stock = path + "/stock_to_theme.txt"
path_base_label = path + "/stock_list_result.txt"
path_word2vec = path + "/word2vec_item_only.txt"

base_label = read_baselabel(path_base_label)
base_label_dic, stock_names = base_label.transpose()
word2vec = read_word2vec(path_word2vec)
word2vec_dic = word2vec.read_w2v()


def makeDict(path1):
    # store the rules in a dict: the key is a stock, the value is the concepts that may occur for it
    dict = {}
    fin = open(path1, "r")
    for line in fin:
        line1 = line.strip().split("\t")
f = open('../data/test_data_output.txt', 'w')

if __name__ == '__main__':
    # load the models
    LTP_DATA_DIR = '../../../pyltp/model/ltp_data_v3.4.0/'  # path to the LTP model directory
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # segmentation model, file name `cws.model`
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model, file name `pos.model`
    # par model is bad!
    # par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parsing model, file name `parser.model`

    segmentor = Segmentor()  # initialize the segmentation instance
    segmentor.load(cws_model_path)  # load the segmentation model
    postagger = Postagger()  # initialize the tagging instance
    postagger.load(pos_model_path)  # load the tagging model
    # parser = Parser()  # initialize the instance
    # parser.load(par_model_path)  # load the model

    # create an XMLReader
    myparser = xml.sax.make_parser()
    # turn off namespaces
    myparser.setFeature(xml.sax.handler.feature_namespaces, 0)
    # override the ContentHandler
    Handler = DataHandler()
    myparser.setContentHandler(Handler)
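    # A hedged sketch of the parsing call that presumably follows (the input
    # file name is a placeholder, not taken from the original project):
    myparser.parse('../data/test_data.xml')
    f.close()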
# -*- coding: utf-8 -*- """ Created on Tue May 2 08:12:40 2017 @author: chenming """ # -*- coding: utf-8 -*- from pyltp import SentenceSplitter sents = SentenceSplitter.split('元芳你怎么看?我就趴窗口上看呗!') # 分句 print('\n'.join(sents)) #分词 # -*- coding: utf-8 -*- from pyltp import Segmentor segmentor = Segmentor() # 初始化实例 segmentor.load('/Users/chenming/Spyder/3.3.1/ltp_data/cws.model') # 加载模型 words = segmentor.segment('元芳你怎么看') # 分词 print('\n'.join(words)) segmentor.release() # 释放模型 #使用分词外部词典 # -*- coding: utf-8 -*- from pyltp import Segmentor segmentor = Segmentor() # 初始化实例 segmentor.load_with_lexicon( '/Users/chenming/Spyder/3.3.1/ltp_data/cws.model', '/Users/chenming/Spyder/3.3.1/ltp_data/lexicon.txt') # 加载模型 words = segmentor.segment('亚硝酸盐是一种化学物质') print('\t'.join(words)) segmentor.release()
class pnn_count(): def __init__(self): self.mydict = {} self.lines = [] self.lines_num = 3000 self.c = [0,0,0] #PNN self.w_c = [{},{},{}] self.segmentor = Segmentor() self.segmentor.load('cws.model') self.read_file() self.train() self.test() def read_file(self): f = open('pnn_annotated.txt','r') self.lines = f.readlines() f.close() def train(self): for i in range(0,self.lines_num/5*4): line = self.lines[i] line.strip('\n') line_array = line.split('\t') line = line_array[1] words = self.segmentor.segment(line) if line_array[0] == '1': pos = 0 elif line_array[0] =='0': pos = 1 else: pos = 2 for i in words: #calculate frequency if self.w_c[pos].has_key(i): self.w_c[pos][i] += 1 else: for a in range(0,3): self.w_c[a][i] = 0 self.w_c[pos][i] += 1 self.c[pos] += 1 def test(self): count = 0 v = len(self.mydict.keys()) for a in range(self.lines_num / 5 * 4, len(self.lines)-1): wholeline = self.lines[a] print wholeline result = [0.0,0.0,0.0] line_array = wholeline.split('\t') line = line_array[1] words = self.segmentor.segment(line) for i in range(0,3): pci = 1.0 * self.c[i] / (self.lines_num/5 *4) pwci = 1.0 sum_i = 0 for q in self.w_c[i].keys(): sum_i += self.w_c[i][q] for k in words: if self.w_c[i].has_key(k): pwci = pwci * (self.w_c[i][k] + 1) / (sum_i + v) result[i] = pci * pwci maxi = 0 for i in range(0,3): if result[i]>result[maxi]: maxi = i if maxi ==0: if line_array[0] == '1': count += 1 print "my guess is positive" elif maxi==1: if line_array[0] == '0': count += 1 print "my guess is neuter" else: if line_array[0] == '-1': count += 1 print "my guess is negative" print count * 1.0 /(self.lines_num/5)
def bayes(self): segmentor = Segmentor() segmentor.load("cws.model") f = open('data/a_4.txt', 'r') # f = open('pnn_annotated.txt', 'r') # neutral, positive, negative class_freq = [0,0,0] # neutral, positive, negative word_total_count_freq = [0, 0, 0] each_word_count = [{}, {}, {}] accu = [0, 0] print 'train_set' for line in f: result = line.split('\t') ws_lst = segmentor.segment(result[1]) # print line # neutral if result[0] == '0': class_freq[0] += 1 for word in ws_lst: word_total_count_freq[0] += 1 if each_word_count[0].get(word) is not None: # print 'Not none' each_word_count[0][word] += 1 else: # print 'None' each_word_count[0][word] = 1 # positive elif result[0] == '1': class_freq[1] += 1 for word in ws_lst: word_total_count_freq[1] += 1 if each_word_count[1].get(word) is not None: # print 'Not none' each_word_count[1][word] += 1 else: # print 'None' each_word_count[1][word] = 1 # negative elif result[0] == '-1': class_freq[2] += 1 for word in ws_lst: word_total_count_freq[2] += 1 if each_word_count[2].get(word) is not None: # print 'Not none' each_word_count[2][word] += 1 else: # print 'None' each_word_count[2][word] = 1 # print class_freq # print word_total_count_freq # print each_word_count print 'total' total_class_count = class_freq[0] + class_freq[1] + class_freq[2] total_word_count = word_total_count_freq[0] + word_total_count_freq[1] + word_total_count_freq[2] print total_class_count # print total_word_count f.close() f1 = open('a_1.txt', 'r') # 中性 积极, , 消极 # neutral, positive, negative orgin = [0, 0, 0] # 本来有多少积极消极 judge = [0, 0, 0] # 判断出来了多少积极消极 judge_right = [0, 0, 0] print 'test_set_now' for line in f1: result = line.split('\t') # print result[1] ws_lst = segmentor.segment(result[1]) # print test_line[test_count] max = 0 tmp_result = 0 for test_iter in range(3): processed_wst = [] prob_this_class = 1 for test_word in ws_lst: if test_word not in processed_wst: prob_this_class *= (each_word_count[test_iter].get(test_word, 0) + 1.0) / float(word_total_count_freq[test_iter] + total_word_count) processed_wst.append(test_word) prob_this_class *= (float(class_freq[test_iter]) / float(total_class_count)) if prob_this_class > max: max = prob_this_class tmp_result = test_iter if tmp_result == 0: test_result = '0' judge[0] += 1 elif tmp_result == 1: test_result = '1' judge[1] += 1 elif tmp_result == 2: test_result = '-1' judge[2] += 1 if result[0] == test_result: accu[0] += 1 else: accu[1] += 1 if result[0] == '0': orgin[0] += 1 elif result[0] == '1': orgin[1] += 1 elif result[0] == '-1': orgin[2] += 1 if result[0] == '0' == test_result: judge_right[0] += 1 elif result[0] == '1' == test_result: judge_right[1] += 1 elif result[0] == '-1' == test_result: judge_right[2] += 1 # print 'result is %s'%test_result # print 'count are %d, %d'%(accu[0], accu[1]) # print 'accuracy so far: %f'%(float(accu[0]) / float(accu[0] + accu[1])) f1.close() print 'orgin' print orgin print 'judge' print judge print 'judge_right' print judge_right print 'total' print accu print 'accuracy this time is %f'%((float(accu[0]) / float(accu[0] + accu[1])))
# -*- coding: utf-8 -*-
import sys, os

ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
sys.path = [os.path.join(ROOTDIR, "lib")] + sys.path

# Set your own model path
MODELDIR = os.path.join(ROOTDIR, "ltp_data")

from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

paragraph = '中国进出口银行与中国银行加强合作。中国进出口银行与中国银行加强合作!'
sentence = SentenceSplitter.split(paragraph)[0]

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
words = segmentor.segment(sentence)
print "\t".join(words)

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# a list-of-string parameter is supported since 0.1.5
# postags = postagger.postag(["中国", "进出口", "银行", "与", "中国银行", "加强", "合作"])
print "\t".join(postags)

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)
class PreProcessor(object) : def __init__(self , cws_model_path=CWS_MODEL_PATH , stop_words_dir=STOP_WORDS_DIR) : self.raw_data = None self.processed_data = None self.words_dict = None self.STOP_WORDS = self._load_stop_words(stop_words_dir) self.segmentor = Segmentor() self.segmentor.load(cws_model_path) def _load_stop_words(self , dir_name) : stop_words = set() cur_abs_dir_path = os.path.split(os.path.abspath(__file__))[0] dir_path = os.path.join(cur_abs_dir_path , dir_name) for file_name in os.listdir(dir_path) : file_path = os.path.join(dir_path , file_name) with open(file_path) as f : for line in f : word = line.strip() stop_words.add(word) for symbol in SENT_SPLIT_SYMBOLS : stop_words.add(symbol) return stop_words def load_raw_data(self , path) : with open(path) as f : self.raw_data = json.load(f) def _split_sentence(self , content) : ''' split content to sentence ''' sents = [] paras = content.split("\n") for paragraph in paras : split_rst = re.split(ur"[%s]+" %(SENT_SPLIT_SYMBOLS) , paragraph) # has space sents.extend(split_rst) return sents def _segment(self , unicode_line) : ''' return : list of words ''' utf8_line = unicode_line.strip().encode("utf8") words = list(self.segmentor.segment(utf8_line)) return words def _make_doc_data(self , url , title_seged , sents_seged) : return { 'url' : url , 'title' : title_seged , 'content' : sents_seged } def _add_word2words_dict(self , words) : for word in words : if word not in self.STOP_WORDS : word = word.lower() self.words_dict.add(word) def do_preprocessing(self) : logging.info("do preprocessing ...") self.processed_data = dict() self.words_dict = set() for page_id , page_data in self.raw_data.items() : url = page_data['url'] title = page_data["title"] content = page_data["content"] sents = self._split_sentence(content) # segment title_words = self._segment(title) content_words = [] for sent in sents : content_words.extend(self._segment(sent)) content_words.append(" ") # another space to avoid that they become one line when merging at output snippet self.processed_data[page_id] = self._make_doc_data(url , title_words , content_words) self._add_word2words_dict(title_words + content_words) logging.info('done.') def save_doc_data(self , to_path) : logging.info("saving doc data to ` %s `" %(to_path) ) with open(to_path , 'w') as of: json.dump(self.processed_data , of ) logging.info("done.") def save_words_dict(self , to_path) : logging.info("saving words dict to ` %s `" %(to_path)) words_list = list(self.words_dict) words_dict = {word : word_id for word_id , word in enumerate(words_list) } with open(to_path , 'w') as of : json.dump(words_dict , of , ensure_ascii=False) # json not support `set` logging.info("done.")
#!/usr/bin/env python
# coding: utf-8
from pyltp import Segmentor

segmentor = Segmentor()
segmentor.load('/downloads/cws.model')


def segment(text):
    # pyltp expects utf-8 byte strings under Python 2
    if isinstance(text, unicode):
        text = text.encode('utf-8')
    words = segmentor.segment(text)
    return map(lambda x: x.decode('utf-8'), words)
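# Hedged usage sketch (Python 2, matching the snippet above); the sample
# sentence is arbitrary.
if __name__ == '__main__':
    for w in segment(u'亚硝酸盐是一种化学物质'):
        print w.encode('utf-8')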
# -*- coding: utf-8 -*-
from pyltp import Segmentor

segmentor = Segmentor()
segmentor.load("/Users/lzy/Code/ltp_model/cws.model")


def word_seg(line, label="0"):
    words = segmentor.segment(line)
    s = " ".join(words)
    return s
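# Hedged usage sketch (the file names are made up; note that the `label`
# argument of word_seg is currently unused):
if __name__ == '__main__':
    with open('corpus.txt') as fin, open('corpus_seg.txt', 'w') as fout:
        for line in fin:
            fout.write(word_seg(line.strip()) + '\n')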
def main(): f = open("psgs.txt", "r") lines = [line.rstrip() for line in f.readlines()] f.close() segmentor = Segmentor() segmentor.load(os.path.join(MODELDIR, "cws.model")) postagger = Postagger() postagger.load(os.path.join(MODELDIR, "pos.model")) f = open("../questions/q_facts_segged_clf.txt", "r") types = f.readlines() f.close() f = open("../questions/provided/q_facts.txt", "r") questions = [line.rstrip() for line in f.readlines()] f.close() f = open("psgs_segged.txt", "w") fans = open("zhidao_answer.txt", "w") i = 0 qid = 0 flag = 0 while i < len(lines): line = lines[i] if (i % 50000 == 0): print "\r#\t%d" % i, sys.stdout.flush() if line.startswith("<question"): qid = int(line.split(" ")[1].split("=")[1].split(">")[0]) flag = 0 f.write(line + "\n") elif line.startswith("</doc") or line.startswith("</question"): f.write(line + "\n") elif line.startswith("<doc"): f.write(line + "\n" + lines[i+1] + "\n") i += 2 else: L = len(line) s = 0 for s in range(L): if line[s:].startswith("最佳答案:") \ or line[s:].startswith("[专业]答案")\ or line[s:].startswith("、"+questions[qid-1]): break if line[s:].startswith("最佳答案"): s += 14 elif line[s:].startswith("[专业]答案"): s += 15 elif line[s:].startswith("、"+questions[qid-1]): s += len(questions[qid-1])+1 if s < L and flag == 0: t = s + 1 while t < L and line[t:].startswith("更多") == False\ and not (t+2<L and line[t]==" " and line[t+1] in "0123456789" and line[t+2] in "0123456789")\ and not line[t:].startswith("~")\ and not line[t:].startswith("?")\ and not line[t:].startswith("!")\ and not line[t:].startswith("。"): t += 1 if s < t and t-s < 200 and t-s > 1: ans = line[s:t].rstrip(".。 ??,,") if types[qid-1].rstrip() == "Q_number": ans = first_con_number(ans) fans.write("%d\t%s\n" % (qid, ans)) flag = 1 # words = segmentor.segment(line) # postags = postagger.postag(words) # for j in range(len(words)): # f.write("%s/%s\t" % (words[j], postags[j])) # f.write("\n") i += 1 f.close() fans.close()
import sys, os

ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
sys.path = [os.path.join(ROOTDIR, "lib")] + sys.path

# Set your own model path
MODELDIR = os.path.join(ROOTDIR, "./ltp_data")

from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

if __name__ == '__main__':
    paragraph = '他叫汤姆去拿外衣。'

    # --------------------- sentence splitting ------------------------
    sentence = SentenceSplitter.split(paragraph)[0]

    # -------------------- context manager -------------
    with Segmentor(os.path.join(MODELDIR, "cws.model")) as s:
        words = s.segment(sentence)
        print("\t".join(words))

    # --------------------- word segmentation ------------------------
    segmentor = Segmentor(os.path.join(MODELDIR, "cws.model"))
    segmentor_with_vocab = Segmentor(
        os.path.join(MODELDIR, "cws.model"),
        lexicon_path='lexicon.txt',        # entries the model splits apart are merged according to the lexicon
    )
    segmentor_with_force_vocab = Segmentor(
        os.path.join(MODELDIR, "cws.model"),
        force_lexicon_path='lexicon.txt'   # in addition, entries the model merged are also forced apart
    )
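    # A hedged sketch (not in the original file): run the three segmenters built
    # above on the same sentence to compare plain, lexicon, and force-lexicon
    # behaviour; only the segment() call already shown above is assumed.
    for seg in (segmentor, segmentor_with_vocab, segmentor_with_force_vocab):
        print("\t".join(seg.segment(paragraph)))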
#coding:utf-8 # coding=gbk import codecs import sys reload(sys) sys.setdefaultencoding('gbk') import os LTP_DATA_DIR = 'D:/coding/Python2.7/ltp_data_v3.4.0' # ltp模型目录的路径 cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model') # 分词模型路径,模型名称为`cws.model` from pyltp import Segmentor #分词 segmentor = Segmentor() # 初始化实例 segmentor.load(cws_model_path) # 加载模型 NewsName = ['hpv疫苗','iPhone X', '乌镇互联网大会','九寨沟7.0级地震','俄罗斯世界杯',\ '双十一购物节', '德国大选', '战狼2', '权力的游戏', '李晨求婚范冰冰', '江歌刘鑫',\ '王宝强马蓉离婚案', '百度无人驾驶汽车', '红黄蓝幼儿园', '绝地求生 吃鸡', '英国脱欧',\ '萨德系统 中韩', '雄安新区', '功守道', '榆林产妇坠楼'] for news in NewsName: print news howmuch = os.listdir(unicode('../News/' + news, 'utf8')) howmuch = len(howmuch) - 2 #该新闻有多少篇 DIR = '../News/' + news + '/' F1 = '../Ngrams/Processed/' + news + '/words.txt' F2 = '../Ngrams/Processed/' + news + '/dict.txt' dirpath = './Processed/' + news
from scipy.sparse.csc import csc_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from scipy.sparse import hstack
import sys, os

ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
sys.path.append(os.path.join(ROOTDIR, "lib"))

# Set your own model path
MODELDIR = os.path.join("/home/fish/", "ltp_data")

from pyltp import Segmentor, Postagger, NamedEntityRecognizer  # @UnresolvedImport

# word segmentation
segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))


def ltp(sentence):
    words = segmentor.segment(sentence)
    # POS tagging
    postags = postagger.postag(words)
    # named entity recognition
    netags = recognizer.recognize(words, postags)
    l = []
    li = zip(list(words), list(postags), list(netags))
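    # The original file is cut off at this point; a hedged guess at the rest of
    # ltp(), collecting the (word, postag, netag) triples and returning them.
    for item in li:
        l.append(item)
    return l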
for argument in arguments: arguments_list.append( argument.getAttribute("content").encode("utf-8")) print("加入的元素为(" + argument.getAttribute("content").encode("utf-8") + ")") relations_item = [] relations_item.append(trigger_list) relations_item.append(arguments_list) relations_item.append(len(sentence_list) - 1) print("关系对对应的句子id为:" + str(relations_item[2])) relations_list.append(relations_item) print("一共提取到(" + str(len(relations_list)) + ")组事件对") MODELDIR = "/media/lyt312323529/c4175817-9d97-490b-95c6-636149e75a87/Graph_Generate/ltp_data" print("正在加载LTP模型...") segmentor = Segmentor() p = os.path.join(MODELDIR, "cws.model") segmentor.load(p) postagger = Postagger() postagger.load(os.path.join(MODELDIR, "pos.model")) recognizer = NamedEntityRecognizer() recognizer.load(os.path.join(MODELDIR, "ner.model")) print("加载完毕") print "\n处理新闻标题" f = open( "/media/lyt312323529/c4175817-9d97-490b-95c6-636149e75a87/Graph_Generate/title.txt", "r") lines = f.readlines() title_trigger = [] title_ner = []
emotion_set = [] for line in path.readlines(): emotion_set.append((line.strip().split('\t')[0])) return emotion_set def sortByPMI(coPMI): sorted_tuple =[] for item in coPMI: items = item.split('\001') #print 'item:',items,type(items) #print coPMI[item],type(coPMI[item]) sorted_tuple.append((items[0],items[1],coPMI[item])) return sorted(sorted_tuple,key =itemgetter(0,2)),sorted(sorted_tuple,key= itemgetter(1,2)) segmentor = Segmentor() segmentor.load_with_lexicon(os.path.join(MODELDIR,"cws.model"),"/data0/dm/dict/dict.txt") if __name__ == "__main__": path = os.path.abspath(os.path.dirname(sys.argv[0])) path_property = open(path+"/car_entity_property.txt",'r') pro_words = fun_property_set(path_property) path_sentiment = open(path+"/car_sentiment_dic.txt",'r') sen_words = fun_emotion_set(path_sentiment) path_corpus = path+"/car_pmi_corpus.txt" path_out1 = open(path+"/pro_sen_pmi_corpus_sort1.txt",'w') path_out2 = open(path+"/pro_sen_pmi_corpus_sort2.txt",'w') posPmi = getPMI(path_corpus, pro_words, sen_words)
class RequestHandler(): def __init__(self): self.intents = [ 'translation', 'app', 'calc', 'match', 'radio', 'health', 'novel', 'video', 'cinemas', 'music', 'stock', 'train', 'news', 'message', 'map', 'weather', 'cookbook', 'tvchannel', 'flight', 'schedule', 'riddle', 'email', 'contacts', 'bus', 'website', 'datetime', 'poetry', 'lottery', 'chat', 'epg', 'telephone' ] self.segmentor = Segmentor() # 初始化实例 CWS self.segmentor.load(configs.cws_path) # 加载模型 self.postagger = Postagger() # 初始化实例 POS Tagger self.postagger.load(configs.pos_path) # 加载模型 self.labeller = SementicRoleLabeller() # 初始化实例 SRLer self.labeller.load(configs.srl_path) # 加载模型 self.parser = Parser() # 初始化实例 Parser self.parser.load(configs.parser_path) # 加载模型 self.ac = ACAutomatons() self.clf_31 = NBSVM() self.char_vectorizer_31 = joblib.load(configs.models_path + '/nbsvm-vocab-ch.pkl') self.word_vectorizer_31 = joblib.load(configs.models_path + '/nbsvm-vocab-wd.pkl') self.clf_31 = joblib.load(configs.models_path + '/nbsvm_31.pkl') self.ch2_ = joblib.load(configs.models_path + '/nbsvm-feature_selector.pkl') self.word_vectorizer_tv = joblib.load(configs.models_path + '/vocab-wd_epg-tvchannel.pkl') self.char_vectorizer_tv = joblib.load(configs.models_path + '/vocab-ch_epg-tvchannel.pkl') self.clf_tv = joblib.load(configs.models_path + '/svm_epg-tvchannel.pkl') self.word_vectorizer_movie = joblib.load(configs.models_path + '/vocab-wd_video-cinemas.pkl') self.char_vectorizer_movie = joblib.load(configs.models_path + '/vocab-ch_video-cinemas.pkl') self.clf_movie = joblib.load(configs.models_path + '/svm_video-cinemas.pkl') self.char_vectorizer_internet = joblib.load( configs.models_path + '/vocab-ch_website-app.pkl') self.word_vectorizer_internet = joblib.load( configs.models_path + '/vocab-wd_website-app.pkl') self.clf_internet = joblib.load(configs.models_path + '/svm_website-app.pkl') self.char_vectorizer_star = joblib.load(configs.models_path + '/vocab-ch_video-music.pkl') self.clf_star = joblib.load(configs.models_path + '/svm_video-music.pkl') self.word_vectorizer_star = joblib.load(configs.models_path + '/vocab-wd_video-music.pkl') self.char_vectorizer_video = joblib.load(configs.models_path + '/vocab-ch_video-epg.pkl') self.word_vectorizer_video = joblib.load(configs.models_path + '/vocab-wd_video-epg.pkl') self.clf_video = joblib.load(configs.models_path + '/svm_video-epg.pkl') def getResult(self, sentence): """1. Complete the classification in this function. Args: sentence: A string of sentence. Returns: classification: A string of the result of classification. """ processed = self.preprocess(sentence) return self.pipeline(processed) def getBatchResults(self, sentencesList): """2. You can also complete the classification in this function, if you want to classify the sentences in batch. Args: sentencesList: A List of Dictionaries of ids and sentences, like: [{'id':331, 'content':'帮我打电话给张三' }, {'id':332, 'content':'帮我订一张机票!' }, ... ] Returns: resultsList: A List of Dictionaries of ids and results. The order of the list must be the same as the input list, like: [{'id':331, 'result':'telephone' }, {'id':332, 'result':'flight' }, ... 
] """ resultsList = [] for sentence in sentencesList: resultDict = {} resultDict['id'] = sentence['id'] resultDict['result'] = self.getResult(sentence['content']) resultsList.append(resultDict) return resultsList def pattern_match(self, sample): srl_res = self.sRLMatch(sample) if srl_res != None: return srl_res else: rul_res = self.ruleMatch(sample) if rul_res != None: return rul_res else: return None def ruleMatch(self, sample): domains = get_rule(sample['query'], self.ac) if len(domains) < 1: return None else: sorted_domains = aggregate_domains(domains) for each in sorted_domains: if each[0] == 'datetime': nouns = get_nouns(sample['query'], 'festival', self.ac) if len(nouns) > 0: return 'datetime' else: continue elif each[0] == 'email': if len( set(sample['word']) & set(['写', '回复', '转发', '打开', '查收', '查看', '答复']) ) > 0: return 'email' else: continue else: return None def sRLMatch(self, sample): srl_res = getSRL(sample['query'], self.segmentor, self.postagger, self.parser, self.labeller) if len(srl_res) == 0: #no any predicate in query or single entity return None else: for res in srl_res: predicate_domains = get_predicate(res[0], self.ac) if len(predicate_domains) < 1: continue #no such a predicate in database else: sorted_domains = aggregate_domains(predicate_domains) for each in sorted_domains: if each[0] == 'app': nouns = get_nouns(res[1], 'app', self.ac) if len(nouns) > 0: return 'app' else: continue elif each[0] == 'cinemas': nouns = get_nouns(res[1], 'film', self.ac) if len(nouns) > 0: return 'Movie_stuff' else: continue elif each[0] == 'contacts': # 'nr' by POS-tagger indicates a person's name if 'nr' in sample['tag']: return 'contacts' else: continue elif each[0] == 'cookbook': nouns = get_nouns(res[1], 'food', self.ac) if len(nouns) > 0: # 如果命中任何专有名词,则划分到意图app return 'cookbook' else: continue elif each[0] == 'tvchannel': nouns = get_nouns(res[1], 'tvchannel', self.ac) if len(nouns) > 0: return 'TV_stuff' else: continue elif each[0] == 'video': nouns = get_nouns(res[1], 'video', self.ac) if len(nouns) > 0: return 'Video_stuff' else: continue elif each[0] == 'health': nouns = get_nouns(res[1], 'disease', self.ac) nouns.extend(get_nouns(res[1], 'drug', self.ac)) if len(nouns) > 0: return 'health' else: continue elif each[0] == 'music': nouns_song = get_nouns(res[1], 'song', self.ac) nouns_singer = get_nouns(res[1], 'singer', self.ac) if len(nouns_song) > 0: return 'music' elif len(nouns_singer) > 0: return 'Star_stuff' else: continue elif each[0] == 'novel': nouns = get_nouns(res[1], 'novel', self.ac) if '小说' in res[1] or len(nouns) > 0: return 'novel' else: continue elif each[0] == 'poetry': nouns = get_nouns(res[1], 'poet', self.ac) if len(nouns) > 0: return 'poetry' else: continue elif each[0] == 'radio': if len(get_nouns(res[1], 'radio', self.ac)) > 0: return 'radio' else: continue elif each[0] == 'stock': nouns = get_nouns(res[1], 'stock', self.ac) if len(nouns) > 0: return 'stock' else: continue elif each[0] == 'website': nouns = get_nouns(res[1], 'website', self.ac) if len(nouns) > 0: return 'Internet_stuff' else: continue def retrieval(self, sample): """ To find proper nouns to handle single entity in a query :param sample: a dict indicates a query and its POS tag :return:a string indicates one certain intent """ pn_res = doRetrieval(sample['query'], self.ac) #look up single instance sorted_domains = aggregate_domains(pn_res) if len(sorted_domains) == 1: #one instance domain = sorted_domains[0][0] if len(max(sorted_domains[0][1], key=len)) > len(sample['query']) / 2: if 
domain == 'airline': return 'flight' if domain in ['railwaystation', 'airport']: return 'map' if domain == 'app': return 'app' if domain == 'contacts': return 'contacts' if domain in ['drug', 'disease']: return 'health' if domain == 'festival': return 'datetime' if domain in ['moviestar', 'film', 'video']: return 'video' if domain == 'food': return 'cookbook' if domain == 'novel': return 'novel' if domain == 'place': return 'map' if domain == 'poet': return 'poetry' if domain == 'radio': return 'radio' if domain in ['singer', 'song']: return 'music' if domain == 'sports': return 'match' if domain == 'stock': return 'stock' if domain == 'tvchannel': return 'tvchannel' if domain == 'website': return 'website' return None else: return None def classifyAllIntents(self, sample): """ A classifier for 31 intents including chitchat :param sample: a dict indicates a query and its POS tag :return:a string indicates one certain intent """ raw_query = sample['query'] text = [''.join([w for w in jieba.cut(raw_query)])] test_ch = self.char_vectorizer_31.transform(text) test_wd = self.word_vectorizer_31.transform(text) test_vec = hstack([test_ch, test_wd]) test_vec = csr_matrix(test_vec) test_vec = self.ch2_.transform(test_vec) pred = self.clf_31.predict(test_vec) return pred.tolist()[0] def epgOrTvchannel(self, sample): """ A classifier to label a instance with 'epg' or 'tvchannel' :param sample: a dict indicates a query and its POS tag :return: a string indicates one certain intent """ raw_query = sample['query'] text = [''.join([w for w in jieba.cut(raw_query)])] test_ch = self.char_vectorizer_tv.transform(text) test_wd = self.word_vectorizer_tv.transform(text) test_vec = hstack([test_ch, test_wd]) test_vec = csr_matrix(test_vec) pred = self.clf_tv.predict(test_vec) return pred.tolist()[0] def videoOrCinemas(self, sample): """ A classifier to label a instance with 'video' or 'cinemas' :param sample: a dict indicates a query and its POS tag :return: a string indicates one certain intent """ raw_query = sample['query'] text = [''.join([w for w in jieba.cut(raw_query)])] test_ch = self.char_vectorizer_movie.transform(text) test_wd = self.word_vectorizer_movie.transform(text) test_vec = hstack([test_ch, test_wd]) test_vec = csr_matrix(test_vec) pred = self.clf_movie.predict(test_vec) return pred.tolist()[0] def websiteOrApp(self, sample): """ A classifier to label a instance with 'website' or 'app' :param sample: a dict indicates a query and its POS tag :return: a string indicates one certain intent """ raw_query = sample['query'] text = [''.join([w for w in jieba.cut(raw_query)])] test_ch = self.char_vectorizer_internet.transform(text) test_wd = self.word_vectorizer_internet.transform(text) test_vec = hstack([test_ch, test_wd]) test_vec = csr_matrix(test_vec) pred = self.clf_internet.predict(test_vec) return pred.tolist()[0] def videoOrMusic(self, sample): """ A classifier to label a instance with 'video' or 'music' :param sample: a dict indicates a query and its POS tag :return: a string indicates one certain intent """ raw_query = sample['query'] text = [''.join([w for w in jieba.cut(raw_query)])] test_ch = self.char_vectorizer_star.transform(text) test_wd = self.word_vectorizer_star.transform(text) test_vec = hstack([test_ch, test_wd]) test_vec = csr_matrix(test_vec) pred = self.clf_star.predict(test_vec) return pred.tolist()[0] def videoOrEpg(self, sample): """ A classifier to label a instance with 'epg' or 'video' :param sample: a dict indicates a query and its POS tag :return: a string indicates 
one certain intent """ raw_query = sample['query'] text = [''.join([w for w in jieba.cut(raw_query)])] test_ch = self.char_vectorizer_video.transform(text) test_wd = self.word_vectorizer_video.transform(text) test_vec = hstack([test_ch, test_wd]) test_vec = csr_matrix(test_vec) pred = self.clf_video.predict(test_vec) return pred.tolist()[0] def pipeline(self, sample, use_pse=True, use_retrieval=False): """ A pipeline to label a instance with one of 31 possible intents :param sample: a dict indicates a query and its POS tag :return: a string indicates one certain intent """ if use_pse: ps_res = prettySureExpression(sample['query'], self.ac) if len(list(set([_[1][0] for _ in ps_res]))) == 1: return ps_res[0][1][0] pm_res = self.pattern_match(sample) if pm_res == 'TV_stuff': clf_res = self.classifyAllIntents( sample) # a ML classifier to label 31 intentions if clf_res in ['epg', 'tvchannel']: return clf_res else: return self.epgOrTvchannel( sample) #a ML classifier to label epg or tvchannel elif pm_res == 'Movie_stuff': clf_res = self.classifyAllIntents( sample) # a ML classifier to label 31 intentions if clf_res in ['video', 'cinemas']: return clf_res else: return self.videoOrCinemas(sample) elif pm_res == 'Internet_stuff': clf_res = self.classifyAllIntents( sample) # a ML classifier to label 31 intentions if clf_res in ['website', 'app']: return clf_res else: return self.websiteOrApp(sample) elif pm_res == 'Star_stuff': clf_res = self.classifyAllIntents( sample) # a ML classifier to label 31 intentions if clf_res in ['video', 'music']: return clf_res else: return self.videoOrMusic(sample) elif pm_res == 'Video_stuff': clf_res = self.classifyAllIntents( sample) # a ML classifier to label 31 intentions if clf_res in ['video', 'epg']: return clf_res else: return self.videoOrEpg(sample) elif pm_res == None: if use_retrieval: ret_res = self.retrieval(sample, self.ac) if ret_res == None: return self.classifyAllIntents( sample ) # no pattern matched, so that classify it using ML else: return ret_res else: return self.classifyAllIntents(sample) else: return pm_res def preprocess(self, raw_query): """ To segment a raw user query into words and POS-tags it :param raw_query: a string generated by a user :return: a dict indicate the segmented query ,raw query and POS-tags """ tmp = pseg.cut(raw_query) words = [] pos = [] for word, flag in tmp: words.append(word) pos.append(flag) inst = {} inst['tag'] = pos inst['word'] = words del words del pos inst['query'] = raw_query return inst def close(self): """ To release relevant models """ self.postagger.release() # 释放模型 self.segmentor.release() # 释放模型 self.labeller.release() # 释放模型 self.parser.release() # 释放模型 del self.ac gc.collect()
# kernel method for select term import re import random import os from Config import * from pyltp import Segmentor segmentor = Segmentor() segmentor.load('./model/cws.model') def answer(questionDict): ### return answer for select term question ''' for each in questionDict: print each,questionDict[each] ''' candidateTermList = generate_candidate_term(questionDict['options']) compareSentenceList = generate_compare_sentence(questionDict['body'],candidateTermList) scoreList = rnnlm_score(compareSentenceList) answer = find_best_option(questionDict['options'],candidateTermList,scoreList) #print 'answer',answer return answer def generate_candidate_term(optionList): ### generatee candidata term ### return candidate term list. [[A1,A2],[B1,B2],[C1,C2]] optionList = option_list_regular(optionList) candidateTermList = [] # insert empty list for sentence length for i in range(len(optionList[0])):
# -*- coding: utf-8 -*-
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

segmentor = Segmentor()
segmentor.load('E:/Python/pyltp/ltp_data/cws.model')
words = segmentor.segment('中华人民共和国成立了。我今天高兴地去和志伟约午饭。')
print '\t'.join(words)
segmentor.release()

postagger = Postagger()
postagger.load('E:/Python/pyltp/ltp_data/pos.model')
postags = postagger.postag(words)
print '\t'.join(postags)
postagger.release()

parser = Parser()
parser.load('E:/Python/pyltp/ltp_data/parser.model')
arcs = parser.parse(words, postags)
print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)
parser.release()

recognizer = NamedEntityRecognizer()
recognizer.load('E:/Python/pyltp/ltp_data/ner.model')
netags = recognizer.recognize(words, postags)
print "\t".join(netags)
recognizer.release()

labeller = SementicRoleLabeller()
labeller.load('E:/Python/pyltp/ltp_data/srl/')
roles = labeller.label(words, postags, netags, arcs)
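# A hedged follow-up sketch (not in the original file): print the labelled
# semantic roles and release the model; the role.index / arg.name /
# arg.range.start / arg.range.end attributes are assumed from the pyltp docs.
for role in roles:
    print role.index, " ".join(
        "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
        for arg in role.arguments)
labeller.release()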