def seg(content):
    # Set your own model path
    MODELDIR = "/home/liuqi/ltp/pyltp/ltp_data/"
    segmentor = Segmentor()
    segmentor.load(MODELDIR + "cws.model")
    tWords = segmentor.segment(content)
    return tWords
def split_words(sentence="中国进出口银行与中国银行加强合作", type_list=0):
    """Segment the sentence; if type_list is truthy, return the segmentation result as a list."""
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    words = segmentor.segment(sentence)
    if type_list:
        return [i for i in words]
    return words
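# A brief usage sketch for split_words above (it assumes MODELDIR points at the
# ltp_data directory, as in the neighbouring snippets; the call itself is illustrative):
words = split_words("中国进出口银行与中国银行加强合作", type_list=1)
print("/".join(words))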
class pnn_count():
    def __init__(self):
        self.mydict = {}
        self.segmentor = Segmentor()
        self.segmentor.load('cws.model')
        self.hash_dict()
        self.ltp_process()

    def ltp_process(self):
        sentence_num = 0
        right_num = 0
        f = open('pnn_annotated.txt', 'r')
        for line in f:
            sentence_num += 1
            # print line
            line_array = line.split('\t')
            line = line_array[1]
            count = 0
            words = self.segmentor.segment(line)
            for i in words:
                if self.mydict.has_key(i):
                    count = count + self.mydict[i]
            if count > 0:
                answer = "positive"
                if line_array[0] == '1':
                    right_num += 1
            elif count == 0:
                answer = "neuter"
                if line_array[0] == '0':
                    right_num += 1
            else:
                answer = "negative"
                if line_array[0] == '-1':
                    right_num += 1
            # print "My guess is %s" % answer
            # print "The right answer is %s" % line_array[0]
            # print "result %d" % count
        f.close()
        print "total sentence is %d, right answer is %d" % (sentence_num, right_num)

    def hash_dict(self):
        f = open('negative.txt', 'r')
        for line in f:
            line = line.strip('\n')
            line = line.strip('\r')
            self.mydict[line] = -1
        f.close()
        f = open('positive.txt', 'r')
        for line in f:
            line = line.strip('\n')
            line = line.strip('\r')
            self.mydict[line] = 1
        f.close()
def process(index): ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir) sys.path.append(os.path.join(ROOTDIR, "lib")) # Set your own model path MODELDIR=os.path.join(ROOTDIR, "ltp_data") segmentor = Segmentor() segmentor.load(os.path.join(MODELDIR, "cws.model")) finname = "o_"+str(index)+".txt" foutname = "p_"+str(index)+".txt" print finname count = 0 fin = codecs.open(finname, encoding='utf-8') with codecs.open(foutname, 'w', encoding="utf-8") as fout: while 1: line = fin.readline() if not line: break tmp = line.split(" ^ {")[1] # Get JSON tmp = "{"+tmp data = json.loads(tmp) content = data['content'] # error_correction(content) content = content.strip() segmentation = "" for line in content.split("\n"): line = line.encode("utf-8") words = segmentor.segment(line) segmentation += "/".join(words) segmentation += "/" # Return type of the function is str, not unicode. Thus need to change into unicode. segmentation = unicode(segmentation, "utf-8") pinyin = add_pinyin(segmentation) obj = {} obj['flavor'] = data['flavor'] obj['environment'] = data['environment'] obj['service'] = data['service'] obj['content'] = data['content'] obj['segmentation'] = segmentation obj['pinyin'] = pinyin tmpstr = json.dumps(obj,ensure_ascii=False) fout.write(tmpstr) fout.write('\n') count += 1 print count segmentor.release()
def segmentation(filename, output_filename):
    print "segmenting '%s' to '%s'" % (filename, output_filename)
    f = open(filename, "r")
    lines = f.readlines()
    f.close()

    MODELDIR = "./ltp_data/"
    # segment
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    # postag
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    # Named Entity Recognize
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    # Parse and get SVO
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))

    f = open(output_filename, "w")
    fner = open(output_filename.split(".")[0] + "_ner.txt", "w")
    for _line in lines:
        line = _line[:-1]
        if line[-1] in "\n\r":
            line = line[:-1]
        words = segmentor.segment(line)
        postags = postagger.postag(words)
        # netags = recognizer.recognize(words, postags)
        # arcs = parser.parse(words, postags)
        for i in range(len(words)):
            f.write("%s/%s\t" % (words[i], postags[i]))
            # if netags[i] != 'O':
            #     fner.write("%s/%s\t" % (words[i], netags[i]))
        f.write("\n")
        # fner.write("\n")
    f.close()
def __init__(self):
    self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')  # segmentation model path, file name `cws.model`
    self.pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')  # POS-tagging model path, file name `pos.model`
    self.ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')  # NER model path, file name `ner.model`

    segmentor = Segmentor()
    segmentor.load(self.cws_model_path)
    self.words = segmentor.segment(data)  # `data` is expected to be supplied by the surrounding class (not shown in this excerpt)
    # print("|".join(words))
    segmentor.release()

    postagger = Postagger()  # initialize instance
    postagger.load(self.pos_model_path)  # load model
    self.postags = postagger.postag(self.words)  # POS tagging
    # print('\t'.join(postags))
    postagger.release()  # release model

    recognizer = NamedEntityRecognizer()  # initialize instance
    recognizer.load(self.ner_model_path)  # load model
    self.netags = recognizer.recognize(self.words, self.postags)  # named entity recognition
    # print('\t'.join(netags))
    recognizer.release()  # release model
def pyltp_words():
    from pyltp import Segmentor, Postagger
    segmentor = Segmentor()
    segmentor.load("/home/fredgan/github/pyltp/ltp_data/cws.model")
    # postagger = Postagger()
    # postagger.load("~/github/pyltp/ltp_data/cpos.model")
    for line in open(sys.argv[1], 'r'):
        try:
            style, sentence = line.strip().split('\t')
        except:
            continue
        style_dic.setdefault(style, {})
        words = segmentor.segment(sentence)
        # postags = postagger.postag(words)
        for w in words:
            if w in style_dic[style]:
                style_dic[style][w] += 1
            else:
                style_dic[style][w] = 1
    for k, v in style_dic.iteritems():
        v_list = sorted(v.iteritems(), key=lambda d: d[1], reverse=True)
        print k + "\t" + " ".join(map(lambda i: i[0] + ":" + str(i[1]), v_list[0:min(50, len(v_list))]))
def main(): f = open("psgs.txt", "r") lines = [line.rstrip() for line in f.readlines()] f.close() segmentor = Segmentor() segmentor.load(os.path.join(MODELDIR, "cws.model")) postagger = Postagger() postagger.load(os.path.join(MODELDIR, "pos.model")) f = open("../questions/q_facts_segged_clf.txt", "r") types = f.readlines() f.close() f = open("../questions/provided/q_facts.txt", "r") questions = [line.rstrip() for line in f.readlines()] f.close() f = open("psgs_segged.txt", "w") fans = open("zhidao_answer.txt", "w") i = 0 qid = 0 flag = 0 while i < len(lines): line = lines[i] if (i % 50000 == 0): print "\r#\t%d" % i, sys.stdout.flush() if line.startswith("<question"): qid = int(line.split(" ")[1].split("=")[1].split(">")[0]) flag = 0 f.write(line + "\n") elif line.startswith("</doc") or line.startswith("</question"): f.write(line + "\n") elif line.startswith("<doc"): f.write(line + "\n" + lines[i+1] + "\n") i += 2 else: L = len(line) s = 0 for s in range(L): if line[s:].startswith("最佳答案:") \ or line[s:].startswith("[专业]答案")\ or line[s:].startswith("、"+questions[qid-1]): break if line[s:].startswith("最佳答案"): s += 14 elif line[s:].startswith("[专业]答案"): s += 15 elif line[s:].startswith("、"+questions[qid-1]): s += len(questions[qid-1])+1 if s < L and flag == 0: t = s + 1 while t < L and line[t:].startswith("更多") == False\ and not (t+2<L and line[t]==" " and line[t+1] in "0123456789" and line[t+2] in "0123456789")\ and not line[t:].startswith("~")\ and not line[t:].startswith("?")\ and not line[t:].startswith("!")\ and not line[t:].startswith("。"): t += 1 if s < t and t-s < 200 and t-s > 1: ans = line[s:t].rstrip(".。 ??,,") if types[qid-1].rstrip() == "Q_number": ans = first_con_number(ans) fans.write("%d\t%s\n" % (qid, ans)) flag = 1 # words = segmentor.segment(line) # postags = postagger.postag(words) # for j in range(len(words)): # f.write("%s/%s\t" % (words[j], postags[j])) # f.write("\n") i += 1 f.close() fans.close()
# list = []
# with open(file, 'r') as f:
#     lines = f.readlines()
#     for line in lines:
#         str = line.split('\t')[1]
#         try:
#             list.index(str)
#         except:
#             list.append(str)
#             print(str)
# print(list)  # ['食品餐饮', '旅游住宿', '金融服务', '医疗服务', '物流快递']
from pyltp import Segmentor, Postagger

seg = Segmentor()
seg.load('cws.model')
poser = Postagger()
poser.load('pos.model')

real_dir_path = os.path.split(os.path.realpath(__file__))[0]  # directory of this file
stop_words_file = os.path.join(real_dir_path, 'stopwords.txt')

# allowed POS tags
allow_pos_ltp = ('a', 'i', 'j', 'n', 'nh', 'ni', 'nl', 'ns', 'nt', 'nz', 'v', 'ws')


def cut_stopword_pos(s):
    words = seg.segment(''.join(s.split()))
    poses = poser.postag(words)
    stopwords = {}.fromkeys([
        line.rstrip()
        for line in open(stop_words_file, 'r', encoding='UTF-8')
    ])
# -*- coding:utf-8 -*-
# segment
from pyltp import Segmentor

segmentor = Segmentor()
segmentor.load('./ltp-model/cws.model')


def segment(text):
    return segmentor.segment(text)
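# A minimal usage sketch for the segment() helper above; the sample sentence and the
# __main__ guard are illustrative assumptions. pyltp returns a VectorOfString, so wrap
# the result in list() when a plain Python list is needed.
if __name__ == '__main__':
    words = list(segment('中国进出口银行与中国银行加强合作'))
    print('/'.join(words))
    segmentor.release()  # release the model once segmentation is finished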
class AutoExtraction: """ 新闻言论自动抽取 """ def __init__(self): """ 初始化模型 """ self.seg_sent = SentenceSplitter() # 分句 self.seg = Segmentor() # 分词 self.seg.load(cws_model_path) self.pos = Postagger() # 词性标注 self.pos.load(pos_model_path) self.ner = NamedEntityRecognizer() # 命名实体识别 self.ner.load(ner_model_path) self.par = Parser() # 依存分析 self.par.load(par_model_path) self.similar_word = load_similar_word() # 读取相似词列表 def _sentence_split(self, content): sentences = self.seg_sent.split(content) return [s for s in sentences if len(s) != 0] def _del_punctuation(self, sent): """ 1.分词 2.移除标点符号 3.再次分词 """ sent_seg = self._cut(sent) item_list = [ item.strip() for item in sent_seg if item.strip() not in all_punc ] sent_seg = self._cut(''.join(item_list)) return sent_seg def _cut(self, sent): return ' '.join(self.seg.segment(sent)) def _pos(self, sent): words = sent.split(' ') pos_tags = self.pos.postag(words) return list(pos_tags) def _ner(self, sent, pos_tags): sentence_tag = self.ner.recognize(sent.split(' '), pos_tags) return list(sentence_tag) def _par(self, sent, sentence_tag): arcs = self.par.parse(sent, sentence_tag) return [(arc.head, arc.relation) for arc in arcs] @classmethod def _exist_ner(cls, sentence_tag): """ 判断句子的ner结果是否存在实体,并返回实体内容 """ # Ni Ns Nhr ner_dic = defaultdict(int) ner_set = [ 'S-Ni', 'S-Ns', 'S-Nh', 'B-Ni', 'B-Ns', 'B-Nh', 'I-Ni', 'I-Ns', 'I-Nh', 'E-Ni', 'E-Ns', 'E-Nh' ] i = 0 while i < len(sentence_tag): for j in range(i, len(sentence_tag)): if sentence_tag[j] not in ner_set: break if j == i: i += 1 else: ner_dic[i] = j i = j return ner_dic @classmethod def _tf_idf(cls, text_list): """ 计算tf-idf """ tf_idf = TfidfVectorizer() return tf_idf.fit_transform(text_list) @classmethod def _cosine_sim(cls, x1, x2): """ 文本相似性 """ return cosine_similarity(x1, x2) def _has_next_sentence(self, x1, x2, threshold): """ 判断是否有下一句话 """ sim = self._cosine_sim(x1, x2)[0][0] if sim > threshold: print(sim) return True return False def process(self, content): """ content: 输入的新闻预料 return: 输出人物和对应言论 """ # 1. 分句 sents = self._sentence_split(content) # 2. 分词、去标点 sents_ = [self._del_punctuation(s) for s in sents] # 3. 词性标注 postags = [self._pos(s) for s in sents_] # 4. 命名实体识别 netags = [self._ner(s, p) for s, p in zip(sents_, postags)] # 5. 依存句法分析 arcs_list = [ self._par(w.split(' '), n) for w, n in zip(sents_, netags) ] # 6. tf-idf tf_idf_vec = self._tf_idf(sents_) extract_result = [] for index, netag in enumerate(netags): ner_dic = self._exist_ner(netag) # print(ner_dic) if not ner_dic: # 判断是否存在实体 continue words = sents_[index].split(' ') # print(words) subject_verb = defaultdict(int) # (i, arc[0]-1) for i, arc in enumerate(arcs_list[index]): if arc[1] == 'SBV': # [(subject_index, verb_index),...] 
if (arc[0] - 1) not in subject_verb.keys(): subject_verb[arc[0] - 1] = i else: if i > subject_verb[arc[0] - 1]: subject_verb[arc[0] - 1] = i # print('words:{}\n ner:{}\n arcs:{}\n s_b:{}\n'.format(words,netags[index], arcs,subject_verb)) for v, s in subject_verb.items(): # 根据句法分析获得的 实体索引 s 和 动词索引 v if words[v] in self.similar_word: # 判断动词是否为相似词 # print('s:{},v:{}'.format(s,v)) # 如果SBV的 subject 不在实体,则任选一个距离s和v最近的实体作为 subject if s in ner_dic.keys(): subject = ''.join(words[s:ner_dic[s]]) else: l = [(n[0], n[1], s - n[0]) for n in list(ner_dic.items()) if n[0] < v and n[0] < s] if l: # 如果前面不存在实体, 则选择非实体词 start, end, _ = min(l, key=lambda x: x[2]) subject = ''.join(words[start:end]) else: subject = words[s] said = words[v] # 判断下一句话是否与当前是同一个语境:1)存在下一句话 2)两句话相似 3)下一句不存在实体 speech = sents[index].split(words[v])[1] if index < len(netags) - 1 and self._has_next_sentence( tf_idf_vec[index], tf_idf_vec[index + 1], 0.1) and not self._exist_ner(netags[index + 1]): # print('similar:{},{}'.format(sents[index], sents[index + 1])) speech += sents[index + 1] extract_result.append((subject, said, speech)) return extract_result def release(self): """ :释放模型 """ self.seg.release() self.pos.release() self.ner.release() self.par.release()
    # (tail of a stop-word filtering helper; its def line is not part of this excerpt)
    ret = []
    for text_word in text_words:
        if text_word not in stopwords:
            ret.append(text_word)
    return ret


def write_result(data: list):
    """
    Write the results to a file.
    :param data: the results
    :return: None
    """
    with open("data/preprocessed.json", "w", encoding="utf-8") as f:
        for line in data:
            f.write(json.dumps(line, ensure_ascii=False) + "\n")


if __name__ == "__main__":
    print("Loading stop words and data!")
    stop_words = get_stop_words()
    need_segs = get_need_seg_file()
    print("Initializing Segmentor!")
    segmentor = Segmentor()
    segmentor.load(cws_model_path)
    print("Segmenting!")
    results = seg(stop_words, need_segs, segmentor)
    segmentor.release()
    write_result(results)
    print("Finish!")
from pyltp import Segmentor, Postagger, Parser
import os
import pickle

LTP_DATA_DIR = r"D:\myprojects\LTP\ltp_data_v3.4.0"
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')

segmentor = Segmentor()  # initialize instance
segmentor.load(cws_model_path)  # load model
postagger = Postagger()
postagger.load(pos_model_path)
parser = Parser()
parser.load(par_model_path)

# =========== strip redundant leading/trailing parts ==========
# content = [
#     [('现在', 'nt'), ('已经', 'd'), ('坏', 'a'), ('了', 'u'), (',', 'wp'), ('假', 'a'), ('的', 'u'), ('数据线', 'n')]
#     ...
# ]
content = []
sen_feature = []  # for each sentence, keep its words, POS tags and dependency parse

with open('./data/file.txt', 'rt', encoding='utf-8') as f1:
    for l in f1:
        line = l.split(',', 4)[4][2:-4]  # strip redundant leading/trailing parts
        words = segmentor.segment(line)  # word segmentation
        postags = postagger.postag(words)  # POS tagging
        arcs = list(parser.parse(words, postags))  # dependency parsing
        word_pos = list(zip(words, postags))
import sys, os

ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
sys.path = [os.path.join(ROOTDIR, "lib")] + sys.path

# Set your own model path
MODELDIR = os.path.join(ROOTDIR, "ltp_data")

from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

paragraph = '中国进出口银行与中国银行加强合作。中国进出口银行与中国银行加强合作!'

sentence = SentenceSplitter.split(paragraph)[0]

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
words = segmentor.segment(sentence)
print "\t".join(words)

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# a list-of-string parameter is supported in 0.1.5
# postags = postagger.postag(["中国", "进出口", "银行", "与", "中国银行", "加强", "合作"])
print "\t".join(postags)

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)
print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)
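# The snippet above imports NamedEntityRecognizer and SementicRoleLabeller without using
# them. A sketch of the remaining steps is shown below; the SRL model file name
# (pisrl.model) and the label() signature follow newer pyltp releases and are assumptions,
# not part of the original script.
recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print "\t".join(netags)

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "pisrl.model"))
roles = labeller.label(words, postags, arcs)
for role in roles:
    print role.index, " ".join("%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments)

segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()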
class LtpParser: def __init__(self): print(111) LTP_DIR = "D:\\ltp_data\\ltp_data_v3.4.0" self.segmentor = Segmentor() self.segmentor.load(os.path.join(LTP_DIR, "cws.model")) self.postagger = Postagger() self.postagger.load(os.path.join(LTP_DIR, "pos.model")) self.parser = Parser() self.parser.load(os.path.join(LTP_DIR, "parser.model")) self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(LTP_DIR, "ner.model")) self.labeller = SementicRoleLabeller() self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model')) print(111) '''语义角色标注''' def format_labelrole(self, words, postags): arcs = self.parser.parse(words, postags) roles = self.labeller.label(words, postags, arcs) roles_dict = {} for role in roles: roles_dict[role.index] = { arg.name: [arg.name, arg.range.start, arg.range.end] for arg in role.arguments } return roles_dict '''句法分析---为句子中的每个词语维护一个保存句法依存儿子节点的字典''' def build_parse_child_dict(self, words, postags, arcs): child_dict_list = [] format_parse_list = [] for index in range(len(words)): child_dict = dict() for arc_index in range(len(arcs)): if arcs[arc_index].head == index + 1: #arcs的索引从1开始 if arcs[arc_index].relation in child_dict: child_dict[arcs[arc_index].relation].append(arc_index) else: child_dict[arcs[arc_index].relation] = [] child_dict[arcs[arc_index].relation].append(arc_index) child_dict_list.append(child_dict) rely_id = [arc.head for arc in arcs] # 提取依存父节点id relation = [arc.relation for arc in arcs] # 提取依存关系 heads = ['Root' if id == 0 else words[id - 1] for id in rely_id] # 匹配依存父节点词语 for i in range(len(words)): # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n'] a = [ relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1] ] format_parse_list.append(a) return child_dict_list, format_parse_list '''parser主函数''' def parser_main(self, sentence): words = list(self.segmentor.segment(sentence)) postags = list(self.postagger.postag(words)) arcs = self.parser.parse(words, postags) child_dict_list, format_parse_list = self.build_parse_child_dict( words, postags, arcs) roles_dict = self.format_labelrole(words, postags) return words, postags, child_dict_list, roles_dict, format_parse_list
data_processed.close() return entities def dump_result_to_csv(target): writer = csv.writer(target, delimiter=',') writer.writerow(['entity', 'times', 'length', 'source']) for k, v in sorted_entities: writer.writerow([k, v[0], len(k), v[1]]) if __name__ == '__main__': segmentor = Segmentor() # Initialize model postagger = Postagger() recognizer = NamedEntityRecognizer() segmentor.load('cws.model') # Load model postagger.load('pos.model') recognizer.load('ner.model') # Load raw data data_csv = pd.read_csv('../data.csv') # Fetch column 'title' datas = data_csv['title'] # Run in different method according to arg.method if args.method == 'postagger': entities = PostagResult(datas, postagger, segmentor) target_file = open('target_postagger.csv', 'w', encoding='utf-8-sig') elif args.method == 'recognizer': entities = NameEntityResult(datas, postagger, segmentor, recognizer)
import random
import os, re
from tqdm import tqdm
from pyltp import Segmentor

LTP_DIR = "/home/zxsong/workspace/ltp_data_v3.4.0"
segmentor = Segmentor()
segmentor.load(os.path.join(LTP_DIR, "cws.model"))


def split_sentences(article):
    '''
    Split an article into sentences.
    :param article: str
    :return: list(str)
    '''
    article = article.strip()
    para = re.sub('([。!!??\?])([^”’])', r"\1\n\2", article)  # single-character sentence terminators
    para = re.sub('(\.{6})([^”’])', r"\1\n\2", para)  # English ellipsis
    para = re.sub('(\…{2})([^”’])', r"\1\n\2", para)  # Chinese ellipsis
    para = re.sub('([。!!??\?][”’])([^,。!!??\?])', r'\1\n\2', para)
    para = para.rstrip()
    return para.split("\n")


def cut_sentence(sentence, cut_level):
    '''
    Tokenize a sentence; character-level tokenization is used.
    :param sentence: str
    :return: list(str)
    '''
    # TTnews contains the special symbol <Paragraph>; treat it as a special token
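# cut_sentence is truncated above. A minimal sketch of a character-level cut that keeps
# <Paragraph> intact as a single token; this is an illustrative assumption, not the
# original implementation (the cut_level argument is not handled here).
def char_cut_sketch(sentence):
    tokens = []
    for part in re.split(r'(<Paragraph>)', sentence):
        if part == '<Paragraph>':
            tokens.append(part)  # keep the special symbol as one token
        else:
            tokens.extend(list(part))  # split everything else into single characters
    return [t for t in tokens if t.strip()]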
# -*- coding: utf-8 -*-
from pyltp import Segmentor
import jieba
import sys

segmentor = Segmentor()
segmentor.load("model/cws.model")

reload(sys)
sys.setdefaultencoding('utf8')

# words = segmentor.segment("元芳你怎么看")
# print words
# print "|".join(words)
# segmentor.release()


def simple_word_segment():
    """
    Segment the simplified Chinese wiki corpus with ltp's default settings,
    i.e. without adding a user dictionary or using personalized segmentation.
    :return:
    """
    with open('corpus/wiki_hans.txt', 'r') as in_file:
        count = 0
        for line in in_file.readlines():
            count += 1
            if count % 1000 == 0:
                print count
            try:
class SentenceParse(ModelPath): def __init__(self): # 分词 self.segmentor = Segmentor() self.segmentor.load(self.ltp_cws) # 词性标注 self.postagger = Postagger() self.postagger.load(self.ltp_pos) # 命名实体识别 self.recognizer = NamedEntityRecognizer() self.recognizer.load(self.ltp_ner) # 依存句法分析 self.parser = Parser() self.parser.load(self.ltp_parser) # 语义角色标注 self.labeller = SementicRoleLabeller() self.labeller.load(self.ltp_pisrl) def __call__(self, sentence, *args, **kwargs): words = self.get_words(sentence) postags = self.get_postagger(words) ner = self.get_recognizer(words, postags) arcs = self.get_parse(words, postags) # roles = self.get_rolelabel(words, postags, arcs) parse_result = { "sentence": sentence, "words": words, "postags": postags, "ner": ner, "arcs": [(arc.head, arc.relation) for arc in arcs], # "roles": [ # [role.index, [[arg.name, (arg.range.start, arg.range.end)] for arg in role.arguments]] for role in roles # ], } # print("*"*80) # for k, v in parse_result.items(): # print(k, v) return parse_result # , [arg.name, [arg.range.start, arg.range.end]] # for arg in role.arguments] def get_sentences(self, news): """ 分句 :param news: str 新闻文本 :return: list 句子列表 """ return list(SentenceSplitter.split(news)) def get_words(self, sentence): """ 分词 :param sentence: str 句子 :return: list 分词列表 """ return list(self.segmentor.segment(sentence)) def get_postagger(self, words): """ 词性标注 :param words: :return: list 词性 """ return list(self.postagger.postag(words)) def get_recognizer(self, words, postags): """ 命名实体识别 :param words: list 词列表 :param postags: list 词性列表 :return: list 命名实体列表 """ return list(self.recognizer.recognize(words, postags)) def get_parse(self, words, postags): """ 依存句法分析 :param words: list 词列表 :param postags: list 词性列表 :return: 依存关系 """ arcs = self.parser.parse(words, postags) return arcs def get_rolelabel(self, words, postags, arcs): """ 语义角色标注 :param words: list 词列表 :param postags: list 词性列表 :param arcs: 依存句法分析结果 :return: 语义角色标注 """ roles = self.labeller.label(words, postags, arcs) return roles def release(self): """释放模型""" self.segmentor.release() self.recognizer.release() self.parser.release() self.postagger.release() self.labeller.release()
# -*- coding: utf-8 -*-
from pyltp import Segmentor

modelPath = './data/ltp_data_v3.4.0/cws.model'
segmentor = Segmentor()
segmentor.load(modelPath)

sent = '在包含問題的所有解的解空間樹中,按照深度優先搜尋的策略,從根節點出發深度優先搜尋解空間樹'
words = segmentor.segment(sent)
print(' | '.join(words))
# coding:utf-8 import gensim from pyltp import Segmentor from numpy import * # 用于矩阵运算 import copy # 用于深复制 import os qin = 0 # 改成0是刘辉的路径,否则是秦文涛的路径 # 加载分词模型 segmentor = Segmentor() if qin == 1: segmentor.load('D:/coding/Python2.7/ltp_data_v3.4.0/cws.model') model = gensim.models.Word2Vec.load('../Sentence/model_qin') else: segmentor.load('/Users/liuhui/Desktop/实验室/LTP/ltp_data_v3.4.0/cws.model') model = gensim.models.Word2Vec.load('../Sentence/model') vec_size = 100 stoplist = {} f = open('../stopword.txt', 'r') for line in f: word = line.strip() stoplist[word] = 1 f.close() class Sent: def __init__(self, _newsid, _globalid, _paraid, _localid, _sentnum, _content, _vec): self.newsid = _newsid # 该句所属新闻编号 self.globalid = _globalid # 该句在该篇新闻的第几句
def get_vec(self,filename): def list2str(l): r_str = ' ' for i in l: r_str += str(int(i.split(':')[0]) + 10) +':'+ i.split(':')[1]+ ' ' return r_str def adjust_list(l,words): l.insert(0,'"') n = len(l) cut_list = [] cut_list.append(l[0]) for i in range(1,n): a = '' if cut_list[-1]+l[i] in words: cut_list[-1] = cut_list[-1]+l[i] continue while l[i] not in words and len(l[i]) > 1: a = l[i][-1] + a l[i] = l[i][:-1] if l[i] in words: cut_list.append(l[i]) if a != '': if a in words: cut_list.append(a) # print(cut_list) return cut_list def get_tf_idf(data,syn_dict): all_words = [] dict_freq = {} # 词频 dict_doc = {} # 文档数量 all_num = 0 words = [] stop_words = self.stop_word tf_idf = {} all_sent = [] for k in data.keys(): # print(data[k]) line_list = (data[k].replace('[','').replace(']','').split('", ')) line_list[-1] = line_list[-1][:-1] for i in line_list: line_i = i.split('\t')[1] + '"' all_sent.append(line_i) cut_line = '\t'.join(segmentor.segment(line_i)) words_list = cut_line.split('\t') # 分词 is_adddoc = [] for word in words_list: if word not in stop_words: if word not in dict_freq.keys(): dict_freq[word] = 1 else: dict_freq[word] += 1 if word not in is_adddoc: # 词出现的问题树 if word not in dict_doc.keys(): dict_doc[word] = 1 else: dict_doc[word] += 1 is_adddoc.append(word) for k in dict_freq.keys(): idf = math.log(self.N / dict_doc[k]) tf_idf[k] = 1 + math.log(dict_freq[k]) tf_idf[k] *= idf with open('AC/tf-idf.txt','w') as fr: for k in tf_idf.keys(): fr.write(k) fr.write('\t') fr.write(str(tf_idf[k])) fr.write('\n') def get_feature_vec(q_list,a_list): feature = [] q_den = 1 for word in q_list: q_den += tf_idf[word]**2 for sa_list in a_list: vec_f = 0 a_den = 1 for wa in sa_list: a_den += tf_idf[wa]**2 if wa in set(q_list): vec_f += tf_idf[wa] den = (q_den * a_den)**0.5 vec_f /= den feature.append(round(vec_f*1000,2)) return feature def get_feature_bm25(q_list,a_list,all_words): all_wordsl = list(all_words) # print(all_wordsl[174017]) feature = [] s = BM25(a_list,all_wordsl) # s.simall(q_list) # print(s.simall(q_list)) for i in s.simall(q_list): feature.append(i) # print(feature) return feature # def get_feature_sim(q_list,a_list): # feature = [] # str_q = '' # for q in q_list: # str_q = str_q + ' ' + q # for as_list in a_list: # str_sa = '' # for a in as_list: # str_sa = str_sa + ' ' + a # # print(q_list,as_list,synonyms.compare(q_list,as_list)) # if len(str_sa) < 1 or len(str_q) < 1: # feature.append(0.0) # else: # feature.append(round(synonyms.compare(str_q, str_sa,seg=False)*1000,3)) # return feature def get_feature_same(q,a_list): r = [] for sa_list in a_list: n = 0 for a in sa_list: if a in q: n += 1 r.append(n) return r def get_DA(words_list): postags = postagger.postag(words_list) # 词性标注 pos_line = '\t'.join(postags) pos_list = pos_line.split('\t') # print(pos_list) # print(pos_list) if pos_list == ['']: return [] netags = recognizer.recognize(words_list, pos_list) # 命名实体识别 ner_line = '\t'.join(netags) ner_list = ner_line.split('\t') arcs = parser.parse(words_list, pos_list) # 句法分析 arcs_line = "\t".join("%d %s" % (arc.head, arc.relation) for arc in arcs) arcs_list = arcs_line.split('\t') r = [] rsyn = [] for i in range(len(arcs_list)): # print(words_list[int(arcs_list[i][0])-1] + '_' + words_list[i],arcs_list[i][0]) if pos_list[i][0] in set({'n','v','a'}): r.append(words_list[int(arcs_list[i][0]) - 1] + '_' + words_list[i]) return r def get_feature_DA(q_list,a_list): feature = [] feature_q = get_DA(q_list) feature_a = [] for sa_list in a_list: feature_sa = get_DA(sa_list) # 
print(feature_q) # print(feature_sa) score = 0.0 n = 0 for sa in feature_sa: for q in feature_q: n += 1 # print(sa,q) if sa == q: score += 1 elif sa.split('_')[0] == q.split('_')[0]: score += 0.5 elif sa.split('_')[1] == q.split('_')[1]: score += 0.5 else: n -= 1 # print(score,n) if score > 0.4: feature.append(score/n) else: feature.append(0.0) # print(feature) return feature fd = open(filename, 'r') data = [] data_dict = {} for line in fd: # print(line[:-1]) # print(line[:-1].split('\t')[1]) data.append(line[:-1]) for i in range(0,len(data),2): data_dict[data[i]] = data[i+1] segmentor = Segmentor() segmentor.load('cws.model') postagger = Postagger() # 初始化实例 postagger.load('pos.model') # 加载模型 recognizer = NamedEntityRecognizer() # 初始化实例 recognizer.load('ner.model') # 加载模型 parser = Parser() parser.load('parser.model') tf_idf = {} all_word = [] answer_vec = [] fti = open('AC/tf-idf.txt', 'r') for line in fti: k = line[:-1].split('\t')[0] v = line[:-1].split('\t')[1] tf_idf[k] = round(float(v), 2) all_word.append(k) all_word = set(all_word) j = 0 for k in data_dict.keys(): # print(k,data_dict[k]) # print(j) pos_a = [] cut_line = '\t'.join(segmentor.segment(k.split('\t')[1][1:-1])) words_list = cut_line.split('\t') # 分词 words_list = adjust_list(words_list, all_word) q_list = [] a_list = [] for word in words_list: if word not in self.stop_word: q_list.append(word) line_list = (data_dict[k].replace('[', '').replace(']', '').split('", ')) line_list[-1] = line_list[-1][:-1] for i in line_list: sa_list = [] line_i = i.split('\t')[1] + '"' i_n = i.split('\t')[0] pos_a.append(i_n) cut_line = '\t'.join(segmentor.segment(line_i[1:-1])) words_list = cut_line.split('\t') # 分词 words_list = adjust_list(words_list, all_word) for word in words_list: if word not in self.stop_word: sa_list.append(word) a_list.append(sa_list) # print(q_list) # print(a_list) feature_same = get_feature_same(k.split('\t')[1][1:-1],a_list) feature_vec = get_feature_vec(q_list,a_list) feature_bm25 = get_feature_bm25(q_list,a_list,all_word) # feature_sim = get_feature_sim(q_list, a_list) feature_DA = get_feature_DA(q_list,a_list) for ni in range(len(pos_a)): answer_vec.append(pos_a[ni] + ' 1:' + str(feature_vec[ni]) + ' 2:' + str(feature_DA[ni]) + ' 3:' + str(feature_same[ni]) + list2str(feature_bm25[ni])) j += 1 if j % 500 == 0: print(j) # if j == 5: # break with open('AC/train.txt', 'w') as fw: for avec in answer_vec: fw.write(avec) fw.write('\n')
def build_feature(is_train=True): """ 从初始数据中抽取特征 :param is_train: 训练模式标记 :return: 将提取到的特征写入文件 """ print("Initializing Segmentor!") segmentor = Segmentor() segmentor.load(cws_model_path) # 读取train json文件 if is_train: with open(TRAIN_DATA, 'r', encoding='utf-8') as f: questions = [json.loads(line.strip()) for line in f.readlines()] else: with open(SEARCH_RESULT, 'r', encoding='utf-8') as f: questions = [json.loads(line.strip()) for line in f.readlines()] questions.sort(key=lambda item_: item_['qid']) # 按qid升序排序 # 读入passage json文件 passage = {} with open(SEG_PASSAGE_DATA, encoding='utf-8') as f: for line in f.readlines(): read = json.loads(line.strip()) passage[read['pid']] = read['document'] # 读入raw passage json文件 passage_raw = {} with open(RAW_PASSAGE_DATA, encoding='utf-8') as f: for line in f.readlines(): read = json.loads(line.strip()) passage_raw[read['pid']] = read['document'] # 建立特征矩阵 feature = [] ret = [] for k in range(len(questions)): question = questions[k] sents, corpus = [], [] if is_train: cv = CountVectorizer(token_pattern=r"(?u)\b\w+\b") cv.fit(passage[question['pid']]) tv = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b") tv.fit(passage[question['pid']]) for sent in passage[question['pid']]: corpus.append(sent.split()) else: for pid in question['answer_pid']: sents += passage[pid] for sent in passage[pid]: corpus.append(sent.split()) if len(sents) == 0: # 没有检索到文档 print("no answer pid: {}".format(question['qid'])) continue cv = CountVectorizer(token_pattern=r"(?u)\b\w+\b") cv.fit(sents) tv = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b") tv.fit(sents) # 提取 BM25 特征 bm25_model = bm25.BM25(corpus) q = list(segmentor.segment(question['question'])) scores = bm25_model.get_scores(q) if is_train: for i in range(len(passage[question['pid']])): ans_sent = passage[question['pid']][i] feature_array = extract_feature(q, ans_sent, cv, tv) feature_array.append(scores[i]) feature.append(' '.join([str(attr) for attr in feature_array]) + '\n') sen = {} if passage_raw[ question['pid']][i] in question['answer_sentence']: sen['label'] = 1 else: sen['label'] = 0 sen['qid'] = question['qid'] sen['question'] = question['question'] sen['answer'] = passage[question['pid']][i] ret.append(sen) else: for i in range(len(sents)): feature_array = extract_feature(q, sents[i], cv, tv) feature_array.append(scores[i]) feature.append(' '.join([str(attr) for attr in feature_array]) + '\n') sen = { 'label': 0, 'qid': question['qid'], 'question': question['question'], 'answer': sents[i] } ret.append(sen) # 特征写入文件 feature_path = RAW_FEATURE if is_train else TEST_FEATURE with open(feature_path, 'w', encoding='utf-8') as f: f.writelines(feature) # 句子写入文件 sentence_path = RAW_SENTENCE if is_train else TEST_SENTENCE with open(sentence_path, 'w', encoding='utf-8') as f: for sample in ret: f.write(json.dumps(sample, ensure_ascii=False) + '\n') segmentor.release()
# -*- coding: utf-8 -*-
from pyltp import Segmentor

segmentor = Segmentor()
segmentor.load("/Users/xiamin/Downloads/ltp_data/cws.model")
words = segmentor.segment("元芳你怎么看")
print "|".join(words)
segmentor.release()
class tokenization(): def __init__(self): self.LTP_DATA_DIR = "/home/mm/Downloads/ltp_data_v3.4.0/" self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model') self.segmentor = Segmentor() # 初始化实例 self.segmentor.load(self.cws_model_path) # 加载模型 self.train_res = self.read_train_res() # 读取tag文本,防止里面有空格去掉空格 # self.all_co_names = self.FDDC_co_list() def read_train_res(self): with open( '/home/mm/Documents/aliyun-FDDC-2018-Financial-Challenge-/chongzu.train' ) as rf: train_res = rf.read() train_res = re.sub(r'\(', '(', train_res) train_res = re.sub(r'\)', ')', train_res) return train_res def tokenize_enti(self, path11): texx, entity_string = convert2txt(path11) # sentences = re.split(r'。', texx) # sentences.sort(key=len, reverse=True) entities = list(set(re.split(r'[\s~、,;/]', entity_string))) entities.sort(key=len) entities_arrows_list = list( set([ x if '~' in x else '' for x in re.split(r'\s', entity_string) ])) entities_arrows_list.sort(key=len, reverse=True) entities_arrows_list = entities_arrows_list[:-1] # 找出结果数据行并且把最后的回车符号去掉 patt_index = re.findall(r'\d{4,10}', path11)[0] res_rows = re.findall(r'(?<=\n){}[^\n]+(?=\n)'.format(patt_index), self.train_res) # 以下是整理train——res # 遍历结果,发现有简称全称的,把匹配的另一半加进去。 """主要目的是修正train——res文件,里面有简称或者全称,并不统一,为了让简称全称都出现, 使用正则提取对应的简称或全称,如果有顿号,把那些字串也分开提取,作为标注的标的,当然是先 把字符长度小的匹配出来,分词之后也是先把长度长的连起来。没问题的""" res_paired = {} # 临时定义一个res的列表,存储修改后的train res for x in range(len(res_rows)): res_row = res_rows[x] for y in range(6): res_paired[str(x) + str(y)] = [re.split(r'\t', res_row)[y]] for arrow_str in entities_arrows_list: for index, result_row in enumerate(res_rows): for indi, res_value in enumerate(re.split(r'\t', result_row)): if indi in [0, 1, 4, 5]: continue res_value_list = res_value.split('、') for res_value_split in res_value_list: if res_value_split in entities and res_value_split in arrow_str: # 找出配对的简称或者全称,添加,如果是股权/估值法/金额直接添加并且continue niki, fullna = re.split(r'~', arrow_str) fullna_first = fullna.split(',')[0] niki_split_list = re.split(r'[/、]', niki) # 对应的全称满足三个条件,长度/逗号 以及含有简称的几个字 if res_value_split in niki_split_list \ and len(fullna_first) < 18 \ and re.search(re.sub(r'(?<=[^屄\s])', '\s?', res_value_split), fullna_first): res_paired[str(index) + str(indi)].append(fullna_first) """ 由全称查简称时候要避免 公司/本公司/上市公司/发起人/申请人/, 含有这几个字的要剔除 """ if res_value_split == fullna_first: # 对应的简称满足几个条件: 包含在全程里面,不长于4个字,不等于 for niki_split in niki_split_list: if re.search(re.sub(r'(?<=[^屄\s])', '\s?', fullna_first), niki_split)\ and not re.search(r'(^公司$|^本公司$|环境$|^上市公司$|人$|资产|标的|交易|对方|发行|对象|股东|对手|单位)',re.sub(r'\s', '', niki_split)): res_paired[str(index) + str(indi)].append( niki_split) # 遍历公告的每一句,把每一句送进模型。 # words_n_words = '' # for i in sentences: words = self.segmentor.segment(texx) words = ' '.join(words) # 分词要使用更好的策略,更长一些,避免太短的句子,重复循环浪费流程 # # 下面是把所有目标主体合并在一起, 把55%股权这样的先分出来, # for ent in entities: # # 把words中所有是实体的中间去掉空格。使用双层sub # # 正则还是要多注释啊 # """ re.sub(r'(?<=\w)(?=\w)'','\s?',ent) 是把实体里面的每个字符中间插入“\s?” # 表示匹配任何以此序列出现但中间可能有空格的情况,分词之后join成空格分割的。然后找出words # 中出现这个序列的地方,将其换成没空格的""" # if len(ent) > 1: # if not re.search(r'([\d.]+%的?(?:股权|股份|权益))', ent): # 如果没有股权关键字,直接加上空格匹配pattern # patt_ent = re.sub(r'(?<=\w)(?=\w)', r'\s?', ent) # elif len(ent) > 7: # 如果有股权关键字,且长度比较长,就把前面主体提出来,单独分词 # patt_ent = re.sub(r'(?<=\w)(?=\w)',r'\s?', re.sub(r'的?[\d.]+%的?(股权|股份|权益)','', ent)) # else: # patt_ent = re.sub(r'(?<=\w)(?=\w)', r'\s?', ent) # # 下面一句把words中所有符合主体列表的项目,可能被分词分开的,重新合并起来,单独成行,在test时使用 # words = re.sub(r'{}'.format(patt_ent), '\s' + ent + '\s', words) # 然后把空格都换成回车,words竖起来了。 # 
words = re.sub(r'\s', '\n', words) # words = re.sub(r'\n+', '\n', words) """把words中所有是结果键值的,后缀上tab键和结果索引号。否则后缀tab键和字母o 目的是好的,就是让模型更容易找到目标,模型不需要判断开始和结束, 但是这样的正则太难了, 我无法将所有合适的实体 全部抽出来,而导致标注的缺失,那么还是把任务给模型了""" # for x in range(len(res_rows)): # for y in range(6): # index = str(x)+str(y) # tags_list = res_paired[index] for index, tags_list in res_paired.items(): # 表中的小表,可能有一个或多个成员,遍历一下,包括顿号分割的那些都可以标出来了,不影响合并好的实体字符串。 for sub_res in sorted(tags_list, key=len, reverse=True): if not index.endswith('0') and len(sub_res) > 1: patt_sub_res = re.sub(r'(?<=[^屄\s])', '\s?', sub_res) if re.search(r'{}'.format(patt_sub_res), words): spliter = re.findall(patt_sub_res, words)[0] words_split_list = re.split(spliter, words) spliter_tagged = re.sub(r'\s', '屄{}'.format(index[1]), spliter) words = spliter_tagged.join(words_split_list) # print(words) # words=re.sub(patt_sub_res, sub_res) # words= re.sub(r'{}(?=\n)'.format(sub_res), '\n{}\t{}\n'.format(sub_res, index), words) # train——result标注完了,现在标注o,就是把非数字结尾的行加上tab和o words = re.sub(r'\s', '\to\n', words) words = re.sub(r'(?<=屄\d)', '\n', words) words = re.sub(r'屄', '\t', words) # words_n_words += words # print(words) with open( '/home/mm/FDDC_datasets_dir/tokenized_datasets_for_anago/chongzu/' + res_paired['00'][0] + '.txt', 'w') as af: af.write(words) print(path11.split("/")[-1])
import sys
import os
import re
import numpy as np
from pyltp import Segmentor

segmentor = Segmentor()
segmentor.load("/home/zzj/ltp_data_v3.4.0/cws.model")

dir = './tokenized_dir/'
filelist = os.listdir(dir)
abstract_avr_len = 0
article_avr_len = 0
coun = 0

# def clear_str(line):
#     emoji_pattern = re.compile(
#         u"(\ud83d[\ude00-\ude4f])|"  # emoticons
#         u"(\ud83c[\udf00-\uffff])|"  # symbols & pictographs (1 of 2)
#         u"(\ud83d[\u0000-\uddff])|"  # symbols & pictographs (2 of 2)
#         u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
#         u"(\ud83c[\udde0-\uddff])|"
#         u"[\U00010000-\U0010ffff]"   # flags (iOS)
#         "+", flags=re.UNICODE)
#     return emoji_pattern.sub(r'', line)

# def tokenize_stories(stories_dir, file):
#     """Tokenize the corresponding file under the weibo directory and store the results as individual files under tokenized_stories_dir/"""
#     num = 0
#     stories = []
#     stories.append(file)
class PyltpAnalyzer(object): def __init__(self, fileDir=LTP_DATA_DIR): """ :param filename: """ print('77777&777777777777777') self.fileDir = fileDir # 初始化分词实例 self.cws_model_path = os.path.join( self.fileDir, 'cws.model') # 分词模型路径,模型名称为`cws.model` self.segmentor = Segmentor() self.segmentor.load(self.cws_model_path) # 加载模型 # 初始化标注实例 self.pos_model_path = os.path.join( self.fileDir, 'pos.model') # 词性标注模型路径,模型名称为`pos.model` self.postagger = Postagger() self.postagger.load(self.pos_model_path) # 加载模型 # 初始化命名实体识别实例 self.ner_model_path = os.path.join( self.fileDir, 'ner.model') # 命名实体识别模型路径,模型名称为`pos.model` self.recognizer = NamedEntityRecognizer() self.recognizer.load(self.ner_model_path) # 加载模型 #依存句法分析 self.par_model_path = os.path.join( self.fileDir, 'parser.model') # 依存句法分析模型路径,模型名称为`parser.model` self.parser = Parser() # 初始化实例 self.parser.load(self.par_model_path) # 加载模型 def loadSegmentorUserdict(self, user_dict): """ 载入用户分词词典 :param user_dict: :return: """ self.segmentor.load_with_lexicon(self.cws_model_path, user_dict) def segmentSentence(self, sentence): return list(self.segmentor.segment(sentence)) def segment(self, sentences): """ :param sentences: 句子列表 :return:句子分词结果 """ wordsList = [] if sentences: for sentence in sentences: wordsList.append(list(self.segmentor.segment(sentence))) return wordsList def postag(self, wordsList): """ :param wordsList: 句子分词列表 :return: 句子分词词性标注结果 """ postagsList = [] if wordsList: for words in wordsList: postagsList.append(list(self.postagger.postag(words))) return postagsList def recognize(self, wordsList, postagsList): """ :param wordsList: 句子分词列表 :param postagsList: 句子标注列表 :return: 句子命名实体识别结果 """ netagsList = [] if wordsList and postagsList: if len(wordsList) == len(postagsList): for words, postags in zip(wordsList, postagsList): netagsList.append( list(self.recognizer.recognize(words, postags))) else: print( "wordsList = {} ,len(wordsList) = {} and postagsList = {} ,len(postagsList)" .format(wordsList, len(wordsList), postagsList, len(postagsList))) else: print("wordsList = {} and postagsList = {}".format( wordsList, postagsList)) return netagsList def dependencyParse(self, wordsList, postagsList): """ :param wordsList: 句子分词列表 :param postagsList: 句子标注列表 :return: 句子句法分析结果 """ arcsList = [] if wordsList and postagsList: if len(wordsList) == len(postagsList): for words, postags in zip(wordsList, postagsList): arcsList.append(list(self.parser.parse( words, postags))) #arc.head 父节点, arc.relation 依存关系 else: print( "wordsList = {} ,len(wordsList) = {} and postagsList = {} ,len(postagsList)" .format(wordsList, len(wordsList), postagsList, len(postagsList))) else: print("wordsList = {} and postagsList = {}".format( wordsList, postagsList)) return arcsList def finalize(self): """ 释放所有没用到的模型 :return: """ self.segmentor.release() # 释放分词模型 self.postagger.release() # 释放词性模型 self.recognizer.release() # 释放命名实体模型 self.parser.release() # 释放依存句法模型
arguments_list.append( argument.getAttribute("content").encode("utf-8")) print("加入的元素为(" + argument.getAttribute("content").encode("utf-8") + ")") relations_item = [] relations_item.append(trigger_list) relations_item.append(arguments_list) relations_list.append(relations_item) print("一共提取到(" + str(len(relations_list)) + ")组事件对") arguments_list = [] MODELDIR = "/media/lyt312323529/c4175817-9d97-490b-95c6-636149e75a87/Graph_Generate/ltp_data" print("正在加载LTP模型...") segmentor = Segmentor() p = os.path.join(MODELDIR, "cws.model") segmentor.load(p) postagger = Postagger() postagger.load(os.path.join(MODELDIR, "pos.model")) recognizer = NamedEntityRecognizer() recognizer.load(os.path.join(MODELDIR, "ner.model")) print("加载完毕") events_list = [] for i in range(len(relations_list)): event = [] trigger = [] print("\n\n处理触发词结果如下") if (relations_list[i][0][0] != "is") and (relations_list[i][0][0] != "de"): words = segmentor.segment(relations_list[i][0][0]) #wordStr = "\t".join(words) #print(wordStr)
def load_source(maindir, word_dict): n_processed = 0 contents_dict = {} segmentor = Segmentor() segmentor.load( '/home/caory/github/table-detection/data/table-v5/ltp_data/cws.model') dirlist = os.listdir(maindir) for docid in dirlist: n_processed += 1 print('Load Source: doc: %s, rate: %.2f%%' % (docid, 100.0 * n_processed / len(dirlist))) sys.stdout.flush() contents_dict[docid] = {} json_path = os.path.join(maindir, docid, 'pages_with_tables') if not os.path.exists(json_path): continue data = read_json(json_path) for pageid in data: contents_dict[docid][pageid] = {} size = data[pageid]['size'] texts, curves, others, tables = [], [], [], [] # 获取表格框 pad, offset = 2, 5 for box in data[pageid]['tables']: left = max(offset, int(math.floor(float(box[0])) - pad)) right = min(int(math.ceil(float(box[2])) + pad), size[0] - offset) top = max(offset, int(math.floor(float(size[1] - box[3])) - pad)) bottom = min(int(math.ceil(float(size[1] - box[1])) + pad), size[1] - offset) if 0 <= left <= right < size[0] and 0 <= top <= bottom < size[ 1]: tables.append({'position': [left, right, top, bottom]}) # 获取文本框 for text in data[pageid]['texts']: # 获取每一个字符的位置 chars = [] for char in text['chars']: left = int(math.floor(float(char['box'][0]))) right = int(math.floor(float(char['box'][2]))) top = int(math.floor(float(size[1] - char['box'][3]))) bottom = int(math.floor(float(size[1] - char['box'][1]))) if 0 <= left <= right < size[ 0] and 0 <= top <= bottom < size[1]: chars.append({ 'position': [left, right, top, bottom], 'sentence': char['text'].strip() }) # 对于距离近的字符进行合并 for char in chars: merged = False for i in range(len(texts)): box = texts[i] if char['position'][2] == texts[i]['position'][2] and \ char['position'][3] == texts[i]['position'][3] and \ text['type'] == texts[i]['type']: if abs(char['position'][0] - texts[i]['position'][1]) <= 5: texts[i]['position'][1] = char['position'][1] merged = True break elif abs(char['position'][1] - texts[i]['position'][0]) <= 5: texts[i]['position'][0] = char['position'][0] merged = True break if not merged: texts.append({ 'position': char['position'], 'type': text['type'], 'sentence': text['text'].strip() }) # 对于页码进行特殊识别 for i in range(len(texts)): top = texts[i]['position'][2] bottom = texts[i]['position'][3] if 1.0 * top / size[1] <= 0.85: continue is_page = True for j in range(len(texts)): if j == i: continue other_top = texts[j]['position'][2] other_bottom = texts[j]['position'][3] if other_bottom >= top: is_page = False break if is_page: texts[i]['type'] = 5 # 将下划线文本框改为表格框 new_texts = [] for text in texts: isline = True if 'sentence' in text and text['type'] == 2: for s in text['sentence']: if s != '_': isline = False if isline and len(text['sentence']) >= 3: pos = [ text['position'][0], text['position'][1], text['position'][3] - 1, text['position'][3] ] curves.append({'position': pos, 'type': 1}) else: new_texts.append(text) else: new_texts.append(text) texts = new_texts # 获取其他框(图片等) for other in data[pageid]['others']: left = int(math.floor(float(other['box'][0]))) right = int(math.floor(float(other['box'][2]))) top = int(math.floor(float(size[1] - other['box'][3]))) bottom = int(math.floor(float(size[1] - other['box'][1]))) if 0 <= left <= right < size[0] and 0 <= top <= bottom < size[ 1]: others.append({ 'position': [left, right, top, bottom], 'type': other['type'] }) # 获取每一个线条的位置 curves = [] curve_width = 2 for curve in data[pageid]['curves']: left = int(math.floor(float(curve['box'][0]))) right = int(math.floor(float(curve['box'][2]))) top = 
int(math.floor(float(size[1] - curve['box'][3]))) bottom = int(math.floor(float(size[1] - curve['box'][1]))) if right - left <= curve_width and bottom - top > curve_width: right = left line = { 'position': [left, right, top, bottom], 'type': curve['type'] } elif right - left > curve_width and bottom - top <= curve_width: bottom = top line = { 'position': [left, right, top, bottom], 'type': curve['type'] } if line: if 0 <= line['position'][0] <= line['position'][1] < size[0] and \ 0 <= line['position'][2] <= line['position'][3] < size[1]: curves.append(line) contents_dict[docid][pageid] = { 'texts': texts, 'size': size, 'tables': tables, 'others': others, 'curves': curves } return contents_dict
import jieba
from jieba import posseg
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SentenceSplitter
from corpus import cws_model, pos_model, parser_model, ner_model, news_path, relation_extract_output_file

segmentor = Segmentor()
segmentor.load(cws_model)
postagger = Postagger()
postagger.load(pos_model)
parser = Parser()
parser.load(parser_model)
recognizer = NamedEntityRecognizer()
recognizer.load(ner_model)
sentencesplit = SentenceSplitter()


def extract_start(input_file_name, output_file_name, begin_line, end_line):
    in_file = open(input_file_name, 'r', encoding='utf8')
    out_file = open(output_file_name, 'w')
    # for line in in_file.readlines()[begin_line:end_line]:
    #     for sentence in sentencesplit.split(''.join(line.split()[:-1])):
    #         fact_extract(sentence, out_file)
    fact_extract('欧几里得是西元前三世纪的希腊数学家。', out_file)
    in_file.close()
    out_file.close()
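# fact_extract() is called above but not defined in this excerpt. A minimal sketch of one
# common approach (an assumption, not the original implementation): pair an SBV arc with a
# VOB arc that shares the same head verb to form a (subject, predicate, object) triple.
def fact_extract_sketch(sentence, out_file):
    words = list(segmentor.segment(sentence))
    postags = list(postagger.postag(words))
    arcs = parser.parse(words, postags)
    for i, arc in enumerate(arcs):
        if arc.relation == 'SBV':  # word i is the subject of the verb at index arc.head - 1
            verb_idx = arc.head - 1
            for j, other in enumerate(arcs):
                if other.relation == 'VOB' and other.head - 1 == verb_idx:
                    out_file.write('%s\t%s\t%s\n' % (words[i], words[verb_idx], words[j]))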
from pyltp import Segmentor, Postagger
from Auth.common import *

CWS_MODEL_PATH = '/home/hiro/ltp_data_v3.4.0/cws.model'
POS_MODEL_PATH = '/home/hiro/ltp_data_v3.4.0/pos.model'

segmentor = Segmentor()
segmentor.load(CWS_MODEL_PATH)
postagger = Postagger()
postagger.load(POS_MODEL_PATH)


def get_real_words_hit(str1):
    words = segmentor.segment(str1)
    postags = postagger.postag(words)
    wordlist = [(words[i], postags[i]) for i in range(0, len(words))]
    punc = open("punctuation.txt", 'rb')
    pr = punc.read()
    pr = pr.decode('gbk')
    p = pr.split()
    lreal = []
    passlist = ['\\', "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", '/', '-', '$', '#']
    for word in wordlist:
        if word[0] in passlist:
            pass
        elif word[1] in ['n', 'v', 'a'] and word[0] not in p:
            lreal.append(word[0])
class pnn_count(): def __init__(self): self.mydict = {} self.lines = [] self.lines_num = 3000 self.c = [0,0,0] #PNN self.w_c = [{},{},{}] self.segmentor = Segmentor() self.segmentor.load('cws.model') self.read_file() self.train() self.test() def read_file(self): f = open('pnn_annotated.txt','r') self.lines = f.readlines() f.close() def train(self): for i in range(0,self.lines_num/5*4): line = self.lines[i] line.strip('\n') line_array = line.split('\t') line = line_array[1] words = self.segmentor.segment(line) if line_array[0] == '1': pos = 0 elif line_array[0] =='0': pos = 1 else: pos = 2 for i in words: #calculate frequency if self.w_c[pos].has_key(i): self.w_c[pos][i] += 1 else: for a in range(0,3): self.w_c[a][i] = 0 self.w_c[pos][i] += 1 self.c[pos] += 1 def test(self): count = 0 v = len(self.mydict.keys()) for a in range(self.lines_num / 5 * 4, len(self.lines)-1): wholeline = self.lines[a] print wholeline result = [0.0,0.0,0.0] line_array = wholeline.split('\t') line = line_array[1] words = self.segmentor.segment(line) for i in range(0,3): pci = 1.0 * self.c[i] / (self.lines_num/5 *4) pwci = 1.0 sum_i = 0 for q in self.w_c[i].keys(): sum_i += self.w_c[i][q] for k in words: if self.w_c[i].has_key(k): pwci = pwci * (self.w_c[i][k] + 1) / (sum_i + v) result[i] = pci * pwci maxi = 0 for i in range(0,3): if result[i]>result[maxi]: maxi = i if maxi ==0: if line_array[0] == '1': count += 1 print "my guess is positive" elif maxi==1: if line_array[0] == '0': count += 1 print "my guess is neuter" else: if line_array[0] == '-1': count += 1 print "my guess is negative" print count * 1.0 /(self.lines_num/5)
def ws_data(self): f = open("pnn_annotated.txt", 'r') total_line = 0 orgin_attr = [0, 0, 0] judge_attr = [0, 0, 0] right = [0, 0, 0] segmentor = Segmentor() segmentor.load("cws.model") for line in f: total_line += 1 # print 'line has been read' value_num = [0, 0] result = line.split('\t') ws_lst = segmentor.segment(result[1]) # print 'this line is %s' % (line) for i in ws_lst: classify = '' try: value = self.setiment_words[i] except: pass else: if value == 1: print 'positive word:%s' % i value_num[0] += 1 elif value == -1: print 'negative word:%s' % i value_num[1] += 1 if value_num[0] == 0 and value_num[1] == 0: classify = 'neutral' judge_attr[0] += 1 elif value_num[0] == value_num[1] != 0: classify = 'neutral' judge_attr[0] += 1 elif value_num[0] > value_num[1]: classify = 'positive' judge_attr[1] += 1 else: classify = 'negative' judge_attr[2] += 1 print value_num print 'classfiy result:%s' % classify # the count of original'emotion if result[0] == '0': orgin_attr[0] += 1 elif result[0] == '1': orgin_attr[1] += 1 else: orgin_attr[2] += 1 if (int(result[0]) == 0 and value_num[0] == 0 and value_num[1] == 0): # print 'neutral' right[0] += 1 elif (int(result[0]) == 0 and value_num[0] == value_num[1] != 0): # print 'neutral' right[0] += 1 elif (int(result[0]) > 0 and value_num[0] >= value_num[1] and value_num[0] != 0): # print 'positive' right[1] += 1 elif (int(result[0]) < 0 and value_num[0] < value_num[1] and value_num[1] != 0): # print 'negative' right[2] += 1 # print 'Accuracy so far: %f\n' % ((right[0] + right[1] + right[2]) / float(total_line)) print 'orgin\'s neutral, positive, negative' print orgin_attr print 'judge_attr neutral, positive, negative' print judge_attr print 'neutral, positive, negative' print right print (right[0] + right[1] + right[2]) print 'total_line %f\n' % total_line print 'Accuracy so far: %f\n' % ((right[0] + right[1] + right[2]) / float(total_line)) segmentor.release()
# coding:utf-8
import math
import copy
from read_baselabel import read_baselabel
from read_word2vec import read_word2vec
import sys, os
from operator import itemgetter
from pyltp import Segmentor
from collections import Counter

segmentor = Segmentor()
segmentor.load("/data0/shenyanjun/ltp_data/cws.model")

path = os.path.abspath(os.path.dirname(sys.argv[0]))
path_rule_for_stock = path + "/stock_to_theme.txt"
path_base_label = path + "/stock_list_result.txt"
path_word2vec = path + "/word2vec_item_only.txt"

base_label = read_baselabel(path_base_label)
base_label_dic, stock_names = base_label.transpose()
word2vec = read_word2vec(path_word2vec)
word2vec_dic = word2vec.read_w2v()


def makeDict(path1):
    # Store the rules in a dict: key is the stock, value is the concepts that may appear for that stock
    dict = {}
    fin = open(path1, "r")
    for line in fin:
        line1 = line.strip().split("\t")
def pyltp_cut(sentence):
    segmentor = Segmentor()  # initialize instance
    segmentor.load(cws_model_path)  # load model
    words = segmentor.segment(sentence)  # segment
    segmentor.release()  # release model
    return words
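# pyltp_cut reloads cws.model on every call, which is slow. A common alternative (a sketch
# assuming cws_model_path is available at import time, as above) is to load the segmentor
# once at module level and reuse it, releasing it only at shutdown:
_segmentor = Segmentor()
_segmentor.load(cws_model_path)


def pyltp_cut_cached(sentence):
    return list(_segmentor.segment(sentence))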
print p_name if __name__ == "__main__" : argp = argparse.ArgumentParser(description="Online Classification System") argp.add_argument("-c" , "--classname" , choices=[classname.JUNK , classname.SENSITIVE] , required=True , help="Classification Type , junk-class or sensitive class ") argp.add_argument("-s" , "--sample_interval_mode" , choices=[sampleIntervalMode.LINE_MODE , sampleIntervalMode.CRLF_MODE] , default=sampleIntervalMode.LINE_MODE , help="The mode with describes what is the inverval symbol between samples , default is LINE_MODE") argp.add_argument("-i" , "--input" , type=str , default="stdin" , help="'sysin' for using standard input ; else file path is needed.") args = argp.parse_args() logging.info("loadding segmentor") segmentor = Segmentor() segmentor.load(CWS_MODEL_PATH) logging.info("done") # loading model if args.classname == classname.JUNK : model = TFIDFModel() model.load_model(JUNK_MODEL_PATH) else : model = BOOLModel() model.load_model(SENSITIVE_MODEL_PATH) #process the input file if args.input == "stdin" : ifo = sys.stdin else : ifo = open(args.input) # if error , just quit
class LtpParser(): def __init__(self, ltp_model_dir): LTP_DIR = ltp_model_dir self.segmentor = Segmentor() self.segmentor.load(os.path.join(LTP_DIR, "cws.model")) self.postagger = Postagger() self.postagger.load(os.path.join(LTP_DIR, "pos.model")) self.parser = Parser() self.parser.load(os.path.join(LTP_DIR, "parser.model")) self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(LTP_DIR, "ner.model")) '''ltp基本操作''' def basic_parser(self, words): postags = list(self.postagger.postag(words)) netags = self.recognizer.recognize(words, postags) return postags, netags '''ltp获取词性''' def get_postag(self, words): return list(self.postagger.postag(words)) '''基于实体识别结果,整理输出实体列表''' def format_entity(self, words, netags, postags): name_entity_dist = {} name_entity_list = [] place_entity_list = [] organization_entity_list = [] ntag_E_Nh = "" ntag_E_Ni = "" ntag_E_Ns = "" index = 0 for item in zip(words, netags): word = item[0] ntag = item[1] if ntag[0] != "O": if ntag[0] == "S": if ntag[-2:] == "Nh": name_entity_list.append(word + '_%s ' % index) elif ntag[-2:] == "Ni": organization_entity_list.append(word + '_%s ' % index) else: place_entity_list.append(word + '_%s ' % index) elif ntag[0] == "B": if ntag[-2:] == "Nh": ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index elif ntag[-2:] == "Ni": ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index else: ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index elif ntag[0] == "I": if ntag[-2:] == "Nh": ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index elif ntag[-2:] == "Ni": ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index else: ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index else: if ntag[-2:] == "Nh": ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index name_entity_list.append(ntag_E_Nh) ntag_E_Nh = "" elif ntag[-2:] == "Ni": ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index organization_entity_list.append(ntag_E_Ni) ntag_E_Ni = "" else: ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index place_entity_list.append(ntag_E_Ns) ntag_E_Ns = "" index += 1 name_entity_dist['nhs'] = self.modify_entity(name_entity_list, words, postags, 'nh') name_entity_dist['nis'] = self.modify_entity(organization_entity_list, words, postags, 'ni') name_entity_dist['nss'] = self.modify_entity(place_entity_list, words, postags, 'ns') return name_entity_dist '''entity修正,为rebuild_wordspostags做准备''' def modify_entity(self, entity_list, words, postags, tag): entity_modify = [] if entity_list: for entity in entity_list: entity_dict = {} subs = entity.split(' ')[:-1] start_index = subs[0].split('_')[1] end_index = subs[-1].split('_')[1] entity_dict['stat_index'] = start_index entity_dict['end_index'] = end_index if start_index == entity_dict['end_index']: consist = [ words[int(start_index)] + '/' + postags[int(start_index)] ] else: consist = [ words[index] + '/' + postags[index] for index in range(int(start_index), int(end_index) + 1) ] entity_dict['consist'] = consist entity_dict['name'] = ''.join( tmp.split('_')[0] for tmp in subs) + '/' + tag entity_modify.append(entity_dict) return entity_modify '''基于命名实体识别,修正words,postags''' def rebuild_wordspostags(self, name_entity_dist, words, postags): pre = ' '.join( [item[0] + '/' + item[1] for item in zip(words, postags)]) post = pre for et, infos in name_entity_dist.items(): if infos: for info in infos: post = post.replace(' '.join(info['consist']), info['name']) post = [ word for word in post.split(' ') if len(word.split('/')) == 2 and word.split('/')[0] ] words = [tmp.split('/')[0] for tmp in post] postags = [tmp.split('/')[1] for tmp in post] return words, postags 
'''依存关系格式化''' def syntax_parser(self, words, postags): arcs = self.parser.parse(words, postags) words = ['Root'] + words postags = ['w'] + postags tuples = list() for index in range(len(words) - 1): arc_index = arcs[index].head arc_relation = arcs[index].relation tuples.append([ index + 1, words[index + 1], postags[index + 1], words[arc_index], postags[arc_index], arc_index, arc_relation ]) return tuples '''为句子中的每个词语维护一个保存句法依存儿子节点的字典''' def build_parse_child_dict(self, words, postags, tuples): child_dict_list = list() for index, word in enumerate(words): child_dict = dict() for arc in tuples: if arc[3] == word: if arc[-1] in child_dict: child_dict[arc[-1]].append(arc) else: child_dict[arc[-1]] = [] child_dict[arc[-1]].append(arc) child_dict_list.append([word, postags[index], index, child_dict]) return child_dict_list '''parser主函数''' def parser_main(self, words, postags): tuples = self.syntax_parser(words, postags) child_dict_list = self.build_parse_child_dict(words, postags, tuples) return tuples, child_dict_list '''基础语言分析''' def basic_process(self, sentence): words = list(self.segmentor.segment(sentence)) postags, netags = self.basic_parser(words) name_entity_dist = self.format_entity(words, netags, postags) words, postags = self.rebuild_wordspostags(name_entity_dist, words, postags) return words, postags
# -*- coding: utf-8 -*-
# Author: MebiuW
# Weibo: @MebiuW
# Python version: 2.7
# Date: 2016/9/10
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import NamedEntityRecognizer

segmentor1 = Segmentor()  # initialize the instance
segmentor1.load('/mnt/hgfs/share/ai2.0/bin/correct/ltp_data/cws.model')  # load the model
postagger = Postagger()  # initialize the instance
postagger.load('/mnt/hgfs/share/ai2.0/bin/correct/ltp_data/pos.model')  # load the model
recognizer = NamedEntityRecognizer()  # initialize the instance
recognizer.load('/mnt/hgfs/share/ai2.0/bin/correct/ltp_data/ner.model')  # load the model


# word segmentation
def segmentor(sentence):
    words = segmentor1.segment(sentence)  # segment the sentence
    # the result can be printed directly
    print '\t'.join(words)
    # or converted into a list
    words_list = list(words)
    # note: releasing the model here means segmentor() only works once;
    # move the release out of this function if it is called repeatedly
    segmentor1.release()  # release the model
    return words_list
def __init__(self): """ init method required. set batch_size, and load some resources. """ self.batch_size = 128 FLAGS = tf.app.flags.FLAGS tf.app.flags.DEFINE_string("ckpt_dir", "./checkpoint_cgrus/checkpoint/", "checkpoint location for the model") tf.app.flags.DEFINE_string("vocab_word_path", "predictor/word_freq.txt", "path of word vocabulary.") tf.app.flags.DEFINE_string("accusation_label_path", "predictor/accu.txt", "path of accusation labels.") tf.app.flags.DEFINE_string("article_label_path", "predictor/law.txt", "path of law labels.") tf.app.flags.DEFINE_float("learning_rate", 0.001, "learning rate") tf.app.flags.DEFINE_integer( "decay_steps", 1000, "how many steps before decay learning rate.") tf.app.flags.DEFINE_float("decay_rate", 1.0, "Rate of decay for learning rate.") tf.app.flags.DEFINE_integer("sentence_len", 400, "max sentence length") tf.app.flags.DEFINE_integer("num_sentences", 16, "number of sentences") tf.app.flags.DEFINE_integer("embed_size", 64, "embedding size") #64 tf.app.flags.DEFINE_integer("hidden_size", 128, "hidden size") #128 tf.app.flags.DEFINE_integer( "num_filters", 128, "number of filter for a filter map used in CNN.") #128 tf.app.flags.DEFINE_integer("embed_size_dpcnn", 64, "embedding size") tf.app.flags.DEFINE_integer("hidden_size_dpcnn", 128, "hidden size") #tf.app.flags.DEFINE_integer("num_filters_big", 128, "number of filter for a filter map used in CNN.") tf.app.flags.DEFINE_string( "model_dpcnn", "dp_cnn", "name of model:han,c_gru,c_gru2,gru,text_cnn") tf.app.flags.DEFINE_string("ckpt_dir_dpcnn", "predictor/checkpoint_dpcnn_big32/", "checkpoint location for the model") tf.app.flags.DEFINE_boolean( "is_training", False, "is traning.true:tranining,false:testing/inference") tf.app.flags.DEFINE_string( "model", "c_gru", "name of model:han,c_gru,c_gru2,gru,text_cnn") #tf.app.flags.DEFINE_boolean("is_training_flag", False, "is traning.true:tranining,false:testing/inference") tf.app.flags.DEFINE_string('cws_model_path', 'predictor/cws.model', 'cws.model path') tf.app.flags.DEFINE_string('pos_model_path', 'predictor/pos.model', 'pos.model path') tf.app.flags.DEFINE_string('ner_model_path', 'predictor/ner.model', 'ner.model path') tf.app.flags.DEFINE_string('gpu', '1', 'help to select gpu divice') os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu segm = Segmentor() segm.load(FLAGS.cws_model_path) # ltp 模型 post = Postagger() post.load(FLAGS.pos_model_path) recognizer = NamedEntityRecognizer() recognizer.load(FLAGS.ner_model_path) self.ltp_model = [segm, post, recognizer] filter_sizes = [2, 3, 4, 5 ] #,6,7,8]#[2,3,4,5]#[6, 7, 8, 9, 10] # [30,40,50] #8 #filter_sizes_big= [2,3,4,5]#,6,7,8]#[2,3,4,5]#[6, 7, 8, 9, 10] # [30,40,50] #8 stride_length = 1 #1.load label dict, restore model from checkpoint # 1.load label dict self.vocab_word2index = load_word_vocab(FLAGS.vocab_word_path) accusation_label2index = load_label_dict_accu( FLAGS.accusation_label_path) articles_label2index = load_label_dict_article( FLAGS.article_label_path) deathpenalty_label2index = {True: 1, False: 0} lifeimprisonment_label2index = {True: 1, False: 0} vocab_size = len(self.vocab_word2index) accusation_num_classes = len(accusation_label2index) article_num_classes = len(articles_label2index) deathpenalty_num_classes = len(deathpenalty_label2index) lifeimprisonment_num_classes = len(lifeimprisonment_label2index) # 2.restore checkpoint config = tf.ConfigProto() config.gpu_options.allow_growth = True graph = tf.Graph().as_default() with graph: 
self.model = HierarchicalAttention(accusation_num_classes, article_num_classes, deathpenalty_num_classes, lifeimprisonment_num_classes, FLAGS.learning_rate, self.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sentence_len, FLAGS.num_sentences, vocab_size, FLAGS.embed_size, FLAGS.hidden_size, num_filters=FLAGS.num_filters, model=FLAGS.model, filter_sizes=filter_sizes, stride_length=stride_length) saver_accu = tf.train.Saver() sess_accu = tf.Session(config=config) saver_accu.restore(sess_accu, tf.train.latest_checkpoint(FLAGS.ckpt_dir)) self.sess = sess_accu # graph_big = tf.Graph().as_default() # with graph_big: # self.model_dpcnn = HierarchicalAttention(accusation_num_classes, article_num_classes, deathpenalty_num_classes,lifeimprisonment_num_classes, # FLAGS.learning_rate, self.batch_size,FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sentence_len, FLAGS.num_sentences,vocab_size, # FLAGS.embed_size_dpcnn, FLAGS.hidden_size_dpcnn,num_filters = FLAGS.num_filters, model = FLAGS.model_dpcnn, filter_sizes = filter_sizes, # stride_length = stride_length) # saver_big = tf.train.Saver() # sess_big = tf.Session(config=config) # saver_big.restore(sess_big, tf.train.latest_checkpoint(FLAGS.ckpt_dir)) # self.sess_big=sess_big self.FLAGS = FLAGS
def bayes(self): segmentor = Segmentor() segmentor.load("cws.model") f = open('data/a_4.txt', 'r') # f = open('pnn_annotated.txt', 'r') # neutral, positive, negative class_freq = [0,0,0] # neutral, positive, negative word_total_count_freq = [0, 0, 0] each_word_count = [{}, {}, {}] accu = [0, 0] print 'train_set' for line in f: result = line.split('\t') ws_lst = segmentor.segment(result[1]) # print line # neutral if result[0] == '0': class_freq[0] += 1 for word in ws_lst: word_total_count_freq[0] += 1 if each_word_count[0].get(word) is not None: # print 'Not none' each_word_count[0][word] += 1 else: # print 'None' each_word_count[0][word] = 1 # positive elif result[0] == '1': class_freq[1] += 1 for word in ws_lst: word_total_count_freq[1] += 1 if each_word_count[1].get(word) is not None: # print 'Not none' each_word_count[1][word] += 1 else: # print 'None' each_word_count[1][word] = 1 # negative elif result[0] == '-1': class_freq[2] += 1 for word in ws_lst: word_total_count_freq[2] += 1 if each_word_count[2].get(word) is not None: # print 'Not none' each_word_count[2][word] += 1 else: # print 'None' each_word_count[2][word] = 1 # print class_freq # print word_total_count_freq # print each_word_count print 'total' total_class_count = class_freq[0] + class_freq[1] + class_freq[2] total_word_count = word_total_count_freq[0] + word_total_count_freq[1] + word_total_count_freq[2] print total_class_count # print total_word_count f.close() f1 = open('a_1.txt', 'r') # 中性 积极, , 消极 # neutral, positive, negative orgin = [0, 0, 0] # 本来有多少积极消极 judge = [0, 0, 0] # 判断出来了多少积极消极 judge_right = [0, 0, 0] print 'test_set_now' for line in f1: result = line.split('\t') # print result[1] ws_lst = segmentor.segment(result[1]) # print test_line[test_count] max = 0 tmp_result = 0 for test_iter in range(3): processed_wst = [] prob_this_class = 1 for test_word in ws_lst: if test_word not in processed_wst: prob_this_class *= (each_word_count[test_iter].get(test_word, 0) + 1.0) / float(word_total_count_freq[test_iter] + total_word_count) processed_wst.append(test_word) prob_this_class *= (float(class_freq[test_iter]) / float(total_class_count)) if prob_this_class > max: max = prob_this_class tmp_result = test_iter if tmp_result == 0: test_result = '0' judge[0] += 1 elif tmp_result == 1: test_result = '1' judge[1] += 1 elif tmp_result == 2: test_result = '-1' judge[2] += 1 if result[0] == test_result: accu[0] += 1 else: accu[1] += 1 if result[0] == '0': orgin[0] += 1 elif result[0] == '1': orgin[1] += 1 elif result[0] == '-1': orgin[2] += 1 if result[0] == '0' == test_result: judge_right[0] += 1 elif result[0] == '1' == test_result: judge_right[1] += 1 elif result[0] == '-1' == test_result: judge_right[2] += 1 # print 'result is %s'%test_result # print 'count are %d, %d'%(accu[0], accu[1]) # print 'accuracy so far: %f'%(float(accu[0]) / float(accu[0] + accu[1])) f1.close() print 'orgin' print orgin print 'judge' print judge print 'judge_right' print judge_right print 'total' print accu print 'accuracy this time is %f'%((float(accu[0]) / float(accu[0] + accu[1])))
#!/usr/bin/env python
# coding: utf-8
from pyltp import Segmentor

segmentor = Segmentor()
segmentor.load('/downloads/cws.model')


def segment(text):
    # pyltp expects utf-8 encoded str input under Python 2
    if isinstance(text, unicode):
        text = text.encode('utf-8')
    words = segmentor.segment(text)
    return map(lambda x: x.decode('utf-8'), words)
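# A minimal usage sketch for segment() above (not part of the original source);
# the sample sentence is arbitrary. Under Python 2, segment() accepts unicode
# (or utf-8 str) input and returns a list of unicode tokens.
if __name__ == '__main__':
    tokens = segment(u'我爱自然语言处理')
    print u' / '.join(tokens)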
class PreProcessor(object) : def __init__(self , cws_model_path=CWS_MODEL_PATH , stop_words_dir=STOP_WORDS_DIR) : self.raw_data = None self.processed_data = None self.words_dict = None self.STOP_WORDS = self._load_stop_words(stop_words_dir) self.segmentor = Segmentor() self.segmentor.load(cws_model_path) def _load_stop_words(self , dir_name) : stop_words = set() cur_abs_dir_path = os.path.split(os.path.abspath(__file__))[0] dir_path = os.path.join(cur_abs_dir_path , dir_name) for file_name in os.listdir(dir_path) : file_path = os.path.join(dir_path , file_name) with open(file_path) as f : for line in f : word = line.strip() stop_words.add(word) for symbol in SENT_SPLIT_SYMBOLS : stop_words.add(symbol) return stop_words def load_raw_data(self , path) : with open(path) as f : self.raw_data = json.load(f) def _split_sentence(self , content) : ''' split content to sentence ''' sents = [] paras = content.split("\n") for paragraph in paras : split_rst = re.split(ur"[%s]+" %(SENT_SPLIT_SYMBOLS) , paragraph) # has space sents.extend(split_rst) return sents def _segment(self , unicode_line) : ''' return : list of words ''' utf8_line = unicode_line.strip().encode("utf8") words = list(self.segmentor.segment(utf8_line)) return words def _make_doc_data(self , url , title_seged , sents_seged) : return { 'url' : url , 'title' : title_seged , 'content' : sents_seged } def _add_word2words_dict(self , words) : for word in words : if word not in self.STOP_WORDS : word = word.lower() self.words_dict.add(word) def do_preprocessing(self) : logging.info("do preprocessing ...") self.processed_data = dict() self.words_dict = set() for page_id , page_data in self.raw_data.items() : url = page_data['url'] title = page_data["title"] content = page_data["content"] sents = self._split_sentence(content) # segment title_words = self._segment(title) content_words = [] for sent in sents : content_words.extend(self._segment(sent)) content_words.append(" ") # another space to avoid that they become one line when merging at output snippet self.processed_data[page_id] = self._make_doc_data(url , title_words , content_words) self._add_word2words_dict(title_words + content_words) logging.info('done.') def save_doc_data(self , to_path) : logging.info("saving doc data to ` %s `" %(to_path) ) with open(to_path , 'w') as of: json.dump(self.processed_data , of ) logging.info("done.") def save_words_dict(self , to_path) : logging.info("saving words dict to ` %s `" %(to_path)) words_list = list(self.words_dict) words_dict = {word : word_id for word_id , word in enumerate(words_list) } with open(to_path , 'w') as of : json.dump(words_dict , of , ensure_ascii=False) # json not support `set` logging.info("done.")
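# A minimal driver sketch for the PreProcessor class above (not part of the
# original source). The file names are hypothetical; the raw JSON is assumed to
# map page ids to dicts with 'url', 'title' and 'content' fields, as the class expects.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    preprocessor = PreProcessor()                  # uses the CWS_MODEL_PATH / STOP_WORDS_DIR defaults
    preprocessor.load_raw_data('raw_pages.json')   # hypothetical input path
    preprocessor.do_preprocessing()
    preprocessor.save_doc_data('doc_data.json')    # hypothetical output paths
    preprocessor.save_words_dict('words_dict.json')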
degree_dict = {}
negative_dict = {}
for i in range(len(f_degree)):
    word = f_degree[i].strip().split('\001')[0]
    if i < 13:
        degree_dict[word] = 2
    elif i < 29:
        degree_dict[word] = 1
    else:
        degree_dict[word] = 3
for line in f_negative:
    word = line.strip().split('\001')[0]
    negative_dict[word] = -1

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))


def parse(words_tags):
    #words = segmentor.segment(sentence)
    #words="清洁 不是 很 彻底 感觉 不是 正品".split()
    #print words
    #postags = postagger.postag(words)
    words = words_tags[0]
    postags = words_tags[1]
    mp = {}
    for index, i in enumerate(postags, 1):
class Sentence_Parser: def __init__(self): LTP_DIR = 'F:\project support\ltp_data_v3.4.0' # 分词 self.segmentor = Segmentor() self.segmentor.load(os.path.join(LTP_DIR, 'cws.model')) # 词性标注 self.postagger = Postagger() self.postagger.load(os.path.join(LTP_DIR, 'pos.model')) # 依存句法分析 self.parser = Parser() self.parser.load(os.path.join(LTP_DIR, 'parser.model')) # 命名实体识别(人名、地名、机构名等) self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(LTP_DIR, 'ner.model')) # 词义角色标注(施事、受事、时间、地点) self.labeller = SementicRoleLabeller() self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model')) def format_labelrole(self, words, postags): """ 词义角色标注 """ arcs = self.parser.parse(words, postags) roles = self.labeller.label(words, postags, arcs) roles_dict = {} for role in roles: roles_dict[role.index] = { arg.name: [arg.name, arg.range.start, arg.range.end] for arg in role.arguments } # for item in roles_dict.items(): # print(item) return roles_dict def bulid_parser_child_dict(self, words, postags, arcs): """ 句法分析---为句子中的每个词语维护一个保存句法依存子节点的字典 """ child_dict_list = [] format_parse_list = [] for index in range(len(words)): child_dict = dict() for arc_index in range(len(arcs)): if arcs[arc_index].head == index + 1: if arcs[arc_index].relation not in child_dict: child_dict[arcs[arc_index].relation] = [] child_dict[arcs[arc_index].relation].append(arc_index) else: child_dict[arcs[arc_index].relation].append(arc_index) child_dict_list.append(child_dict) rely_id = [arc.head for arc in arcs] # print(rely_id) relation = [arc.relation for arc in arcs] # for i in range(len(relation)): # print(words[i], '_', postags[i], '_', i, '_', relation[i]) heads = ['Root' if id == 0 else words[id - 1] for id in rely_id] # print(heads) for i in range(len(words)): a = [ relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1] ] format_parse_list.append(a) return child_dict_list, format_parse_list def parser_main(self, sentence): """ parser主函数 """ words = list(self.segmentor.segment(sentence)) postags = list(self.postagger.postag(words)) arcs = self.parser.parse(words, postags) child_dict_list, format_parse_list = self.bulid_parser_child_dict( words, postags, arcs) roles_dict = self.format_labelrole(words, postags) return words, postags, child_dict_list, roles_dict, format_parse_list def select(self, words, postags): """ 筛选出名词和形容词 """ co_model = Word2Vec.load('coseg_text.model') n_list0 = [] a_list = [] for i in range(len(postags)): if postags[i] == 'n': if len(words[i]) >= 2: n_list0.append(words[i]) if postags[i] == 'a': # if len(words[i]) >= 2: a_list.append(words[i]) n_list0 = list(set(n_list0)) a_list = list(set(a_list)) # print(n_list0) # print(a_list) si_p = [] for n in n_list0: try: s = co_model.similarity(n, '手机') si_p.append(s) except Exception as e: si_p.append(0) index_list = list( map(si_p.index, heapq.nlargest(int(0.8 * len(si_p)), si_p))) #取出和手机相关度最高的n n_list = [] for index in index_list: n_list.append(n_list0[index]) # print(n_list) return n_list, a_list def simlarity(self, n_list0, a_list): """ 计算相似度,进行正逆向匹配,筛选出名词和形容词的最佳搭配 """ n_list0 = n_list0 a_list = a_list co_model = Word2Vec.load('coseg_text.model') si_p = [] for n in n_list0: try: s = co_model.similarity(n, '手机') si_p.append(s) except Exception as e: si_p.append(0) index_list = list( map(si_p.index, heapq.nlargest(int(0.8 * len(si_p)), si_p))) #取出和手机相关度最高的n n_list = [] for index in index_list: n_list.append(n_list0[index]) # 名词正向匹配 comment1_df = pd.DataFrame(columns=['comment_tag', 'similarity'], 
index=[np.arange(100)]) index = 0 for i in range(len(n_list)): f_si = 0 for j in range(len(a_list)): try: si = co_model.similarity(n_list[i], a_list[j]) if si >= f_si: f_si = si comment_tag = n_list[i] + a_list[j] else: f_si = f_si except Exception as e: print('语料库中缺少该词', e) comment1_df.loc[index, ] = [comment_tag, f_si] index += 1 comment1_df = comment1_df.sort_values(by='similarity', ascending=False, ignore_index=True) comment1_df.dropna(subset=['comment_tag'], inplace=True) # comment1_df = comment1_df.iloc[0: int(0.2*len(comment_df)), ] # 形容词匹配逆向匹配 comment2_df = pd.DataFrame(columns=['comment_tag', 'similarity'], index=[np.arange(100)]) index = 0 for i in range(len(a_list)): f_si = 0 for j in range(len(n_list)): try: si = co_model.similarity(n_list[j], a_list[i]) if si >= f_si: f_si = si comment_tag = n_list[j] + a_list[i] else: f_si = f_si except Exception as e: print('语料库中缺少该词', e) comment2_df.loc[index, ] = [comment_tag, f_si] index += 1 comment2_df = comment2_df.sort_values(by='similarity', ascending=False, ignore_index=True) comment1_df.dropna(subset=['comment_tag'], inplace=True) comment_df = pd.merge(comment1_df, comment2_df, on='comment_tag', how='inner') comment_df.dropna(subset=['comment_tag'], inplace=True) return comment_df def cleandata(self, x): """ 对数据进行清洗,替换一些不规则的标点符号 """ pat = re.compile("[^\u4e00-\u9fa5^.^a-z^A-Z^0-9]") # 只保留中英文,去掉符号 x = x.replace(' ', ',') emoji.demojize(x) # 去掉表情表情符号 x = re.sub(pat, ',', x) return x
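# A minimal usage sketch for the Sentence_Parser class above (not part of the
# original source). It assumes the LTP_DIR hard-coded in __init__ exists; the
# sample sentence is arbitrary. Only parser_main is exercised here, since
# select()/simlarity() additionally require the 'coseg_text.model' Word2Vec file.
if __name__ == '__main__':
    sp = Sentence_Parser()
    words, postags, child_dict_list, roles_dict, format_parse_list = sp.parser_main(
        '这款手机的屏幕很清晰,电池也很耐用')
    for item in format_parse_list:
        # [relation, word, index, postag, head word, head index, head postag]
        print(item)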
            if view_word in sentence:
                result = abstract(view_word, sentence)
                if result != None:
                    news_list[num]['opinion'].append(result)
    # print(news_list[num])
    print('执行完成')  # done


if __name__ == '__main__':
    with open(r'D:\Github_project\Project_one\算法模型\data\view_words.pk', 'rb') as f:
        view_words = pickle.load(f)
    # load the models
    # HIT LTP word segmentation
    segmentor = Segmentor()  # initialize the instance
    segmentor.load('D:\data\ltp_data_v3.4.0\cws.model')  # load the model
    # POS tagging
    postagger = Postagger()  # initialize the instance
    postagger.load('D:\data\ltp_data_v3.4.0\pos.model')  # load the model
    # named entity recognition
    recognizer = NamedEntityRecognizer()  # initialize the instance
    recognizer.load(r'D:\data\ltp_data_v3.4.0\ner.model')  # load the model
    # dependency parsing
    parser = Parser()  # initialize the instance
    parser.load(r'D:\data\ltp_data_v3.4.0\parser.model')  # load the model
    # semantic role labeling
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load(r'D:\data\ltp_data_v3.4.0\pisrl_win.model')  # load the model
    # with open(r'D:\Github_project\Project_one\算法模型\data\news_sports.pk', 'rb') as f:
    #     news_sports = pickle.load(f)
# -*- coding: utf-8 -*-
from pyltp import Segmentor

segmentor = Segmentor()
segmentor.load("/Users/lzy/Code/ltp_model/cws.model")


def word_seg(line, label="0"):
    words = segmentor.segment(line)
    s = " ".join(words)
    return s
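# A minimal usage sketch for word_seg() above (not part of the original source);
# the sample sentence is arbitrary. word_seg returns the tokens joined by spaces.
if __name__ == '__main__':
    print(word_seg('我爱自然语言处理'))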
Created on 2018-12-25
@author: Zhukun Luo
Jiangxi University of Finance and Economics
'''
import os
import pandas as pd
import re
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
from pyltp import SentenceSplitter

LTP_DIR = r'D:\LTP\MODEL\ltp_data'  # path of the LTP model directory
segmentor = Segmentor()
segmentor.load(os.path.join(LTP_DIR, "cws.model"))  # word segmentation model, named `cws.model`
postagger = Postagger()
postagger.load(os.path.join(LTP_DIR, "pos.model"))  # POS tagging model, named `pos.model`
parser = Parser()
parser.load(os.path.join(LTP_DIR, "parser.model"))  # dependency parsing model, named `parser.model`
recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(LTP_DIR, "ner.model"))  # named entity recognition model, named `ner.model`
# labeller = SementicRoleLabeller()
# labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))  # semantic role labeling model; in older LTP releases this path is a directory (`srl`), not a single file
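# A minimal pipeline sketch using the models loaded above (not part of the
# original source); the helper name and the input are assumptions. It splits a
# paragraph into sentences with SentenceSplitter, then runs segmentation,
# POS tagging, NER and dependency parsing on each sentence.
def analyze(paragraph):
    results = []
    for sent in SentenceSplitter.split(paragraph):
        words = list(segmentor.segment(sent))
        postags = list(postagger.postag(words))
        netags = list(recognizer.recognize(words, postags))
        arcs = parser.parse(words, postags)
        relations = [(arc.head, arc.relation) for arc in arcs]
        results.append((words, postags, netags, relations))
    return results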
# kernel method for select term
import re
import random
import os
from Config import *
from pyltp import Segmentor

segmentor = Segmentor()
segmentor.load('./model/cws.model')


def answer(questionDict):
    ### return answer for select term question
    '''
    for each in questionDict:
        print each,questionDict[each]
    '''
    candidateTermList = generate_candidate_term(questionDict['options'])
    compareSentenceList = generate_compare_sentence(questionDict['body'], candidateTermList)
    scoreList = rnnlm_score(compareSentenceList)
    answer = find_best_option(questionDict['options'], candidateTermList, scoreList)
    #print 'answer',answer
    return answer


def generate_candidate_term(optionList):
    ### generate candidate terms
    ### return candidate term list. [[A1,A2],[B1,B2],[C1,C2]]
    optionList = option_list_regular(optionList)
    candidateTermList = []
    # insert empty list for sentence length
    for i in range(len(optionList[0])):
class LTP(object): def __init__(self): cws_model_path = os.path.join('../data/ltp_data_v3.4.0', 'cws.model') # 分词模型路径,模型名称为`cws.model` pos_model_path = os.path.join('../data/ltp_data_v3.4.0', 'pos.model') # 词性标注模型路径,模型名称为`pos.model` ner_model_path = os.path.join( '../data/ltp_data_v3.4.0', 'ner.model') # 命名实体识别模型路径,模型名称为`pos.model` self.segmentor = Segmentor() # 初始化实例 self.segmentor.load(cws_model_path) # 加载模型 self.postagger = Postagger() # 初始化实例 self.postagger.load(pos_model_path) # 加载模型 self.recognizer = NamedEntityRecognizer() # 初始化实例 self.recognizer.load(ner_model_path) # 加载模型 # 分词 def segment(self, text): words = list(self.segmentor.segment(text)) return words # 词性标注 def postag(self, words): postags = list(self.postagger.postag(words)) return postags # 获取文本中的时间 def get_time(self, text): # 开始分词及词性标注 words = self.segment(text) #print(words) postags = self.postag(words) #print(postags) time_lst = [] i = 0 for tag, word in zip(postags, words): if tag == 'nt': j = i while postags[j] == 'nt' or words[j] in ['至', '到']: j += 1 time_lst.append(''.join(words[i:j])) i += 1 # 去重子字符串的情形 remove_lst = [] for i in time_lst: for j in time_lst: if i != j and i in j: remove_lst.append(i) text_time_lst = [] for item in time_lst: if item not in remove_lst: text_time_lst.append(item) # print(text_time_lst) return text_time_lst #提取人名地名组织名 def get_name(self, text): persons, places, orgs = set(), set(), set() words = self.segment(text) #print("words333333333333") postags = self.postag(words) #print(postags) netags = list(self.recognizer.recognize(words, postags)) # 命名实体识别 #print(netags) # print(netags) i = 0 for tag, word in zip(netags, words): j = i # 人名 if 'Nh' in tag: if str(tag).startswith('S'): persons.add(word) elif str(tag).startswith('B'): union_person = word while netags[j] != 'E-Nh': j += 1 if j < len(words): union_person += words[j] persons.add(union_person) # 地名 if 'Ns' in tag: if str(tag).startswith('S'): places.add(word) elif str(tag).startswith('B'): union_place = word while netags[j] != 'E-Ns': j += 1 if j < len(words): union_place += words[j] places.add(union_place) # 机构名 if 'Ni' in tag: if str(tag).startswith('S'): orgs.add(word) elif str(tag).startswith('B'): union_org = word while netags[j] != 'E-Ni': j += 1 if j < len(words): union_org += words[j] orgs.add(union_org) i += 1 # print('人名:', ','.join(persons)) # print('地名:', ','.join(places)) # print('组织机构:', ','.join(orgs)) return persons, places, orgs # 释放模型 def free_ltp(self): self.segmentor.release() self.postagger.release()
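# A minimal usage sketch for the LTP class above (not part of the original
# source); the sample text is arbitrary and the model paths come from the class
# defaults. It extracts time expressions and named entities, then releases the models.
if __name__ == '__main__':
    ltp = LTP()
    text = '2021年5月3日,张三在北京参加了清华大学举办的会议。'
    print('times:', ltp.get_time(text))
    persons, places, orgs = ltp.get_name(text)
    print('persons:', persons, 'places:', places, 'orgs:', orgs)
    ltp.free_ltp()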