def post():  # home page after login
    f = request.files['file']
    input_text = request.form['input_txt'].encode("utf-8-sig")
    filename = f.filename
    uname = session['username']
    if request.form['click'] == '上传文件':
        # record user ip, date, user_name
        if filename:
            curr_time = datetime.datetime.now()
            if '.txt' in filename:
                content = f.read().decode(encoding="utf-8-sig")
                sentences = [sentence for sentence in SentenceSplitter.split(content) if sentence]
                # Id = uni_id()
                for i in sentences:
                    sentences_collection.insert({'username': uname,
                                                 'filename': filename.replace('.txt', ''),
                                                 'sentence': i,
                                                 'uploadTime': curr_time,
                                                 'casualtag': '0', 'eventtag': '0',
                                                 'cflag': '0', 'eflag': '0'})
                return render_template("upload_succeed.html", suc_message='upload successful')
            else:
                err_message = 'text format not right!'
                return render_template("upload_error.html", err_message=err_message)
    else:
        curr_time = datetime.datetime.now()
        content = input_text
        sentences = [sentence for sentence in SentenceSplitter.split(content) if sentence]
        for i in sentences:
            sentences_collection.insert({'username': uname,
                                         'filename': '输入上传',
                                         'sentence': i,
                                         'uploadTime': curr_time,
                                         'casualtag': '0', 'eventtag': '0',
                                         'cflag': '0', 'eflag': '0'})
        return render_template("upload_succeed.html", suc_message='upload successful')

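# Note on the handler above (an aside, not part of the original code): pymongo deprecated
# Collection.insert() in 3.0 and removed it in 4.0; the single-document equivalent is
# insert_one(). A hedged drop-in replacement for each insert(...) call would be, e.g.:
#     sentences_collection.insert_one({'username': uname, 'sentence': i, ...})
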
def collect_infos(word):
    infos = hudong.info_extract_hudong(word)
    for info in infos:
        intro_sents = [sent for sent in SentenceSplitter.split(info['intro']) if len(sent) > 0]
        desc_sents = [sent for sent in SentenceSplitter.split(info['desc']) if len(sent) > 0]
        print(intro_sents)
        print('****' * 5)
        print(desc_sents)

def get_sentence_list(self):
    """
    Split self.sentence into sentences.
    :return: list of non-empty sentences
    """
    sents = SentenceSplitter.split(self.sentence)
    return [s for s in sents if s]

def main():
    file_list = []
    path = r'C:/Users/JeremySun/Desktop/Internship/Project02_corpusProcessor/app'
    file_path = batch_file(path=path, file_list=file_list)
    for path in file_path:
        app = open(path, encoding='utf-8').readlines()
        assetPath_text = get_assetPath(text=app)
        assetPath_loss_html = loss_html(text=assetPath_text)
        assetPath_loss_label = loss_label(text=assetPath_loss_html)
        assetPath_loss_mail = loss_mail(text=assetPath_loss_label)
        assetPath_loss_other = loss_other(text=assetPath_loss_mail)
        assetPath_loss_url = loss_url(text=assetPath_loss_other)
        assetPath_clean_url = clean_url(text=assetPath_loss_url)
        assetPath_loss_continue = loss_continue(text=assetPath_clean_url)
        assetPath_loss_word = loss_word(text=assetPath_loss_continue)
        assetPath_loss_comma = loss_comma(text=assetPath_loss_word)

        # split into sentences
        assetPath_sentence = SentenceSplitter.split(assetPath_loss_comma)

        # strip the remaining punctuation and write to file
        pattern_all = re.compile(r"[。.;;::??!!]")  # also strips half- and full-width colons
        f = open("app_pre.txt", 'a', encoding='utf-8')
        for i in tqdm(assetPath_sentence):
            i = re.sub(pattern=pattern_all, repl='', string=i)
            f.write(i + '\n')
        f.close()

def split_sentences(text):
    '''
    Split the text into sentences.
    :param text: text containing multiple sentences
    '''
    return SentenceSplitter.split(text)

def sentence_splitter():
    """
    Sentence-splitting demo.
    """
    sentence = '你好,你觉得这个例子从哪里来的?当然还是直接复制官方文档,然后改了下这里得到的。'
    sents = SentenceSplitter.split(sentence)  # split into sentences
    print("\n".join(sents))

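# Quick check of the demo above (a sketch; the exact split depends on the installed pyltp
# model): SentenceSplitter.split breaks on sentence-final punctuation such as 。?!but not
# on commas, so the sample text should come back as two sentences.
if __name__ == '__main__':
    sentence_splitter()
    # Expected output, roughly:
    # 你好,你觉得这个例子从哪里来的?
    # 当然还是直接复制官方文档,然后改了下这里得到的。
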
def get_sentences(self, news):
    """
    Split a news article into sentences.
    :param news: str, news text
    :return: list, list of sentences
    """
    return list(SentenceSplitter.split(news))

def __handel(self, news):
    news = TextHandle.cht_to_chs(news)
    sentences = []
    for line in news.strip().split('\n'):
        sentences += list(SentenceSplitter.split(line.strip()))
        # sentences += [line.strip()]
    return sentences

def pad_batch(batch_docs, TEXT):
    res_batch_docs = []
    max_words, max_sents = 0, 0
    res_batch_targets = []
    for doc in batch_docs:
        doc_text = doc[2]
        res_doc_text = []
        # use LTP to split the document into sentences
        sents = SentenceSplitter.split(doc_text)
        max_sents = max(max_sents, len(sents))
        for i, sent in enumerate(sents):
            sent = TEXT.preprocess(sent)
            sent = [TEXT.vocab.stoi[word] for word in sent]
            max_words = max(max_words, len(sent))
            res_doc_text.append(sent)
        res_batch_docs.append(res_doc_text)
        res_batch_targets.append(doc[1])
    for doc in res_batch_docs:
        sents = doc
        for sent in sents:
            while len(sent) < max_words:
                sent.append(0)
        while len(sents) < max_sents:
            sents.append([0 for _ in range(max_words)])
    return torch.LongTensor(res_batch_docs), torch.LongTensor(res_batch_targets)

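# Shape note for pad_batch above (descriptive only; names follow the function body):
#     docs, targets = pad_batch(batch_docs, TEXT)
# docs is a LongTensor of shape (len(batch_docs), max_sents, max_words), zero-padded at
# both the word and the sentence level; targets is a LongTensor of shape (len(batch_docs),).
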
def sentence_split(read_file):
    """
    Split the sentences in each paragraph on punctuation.
    :param read_file: opened txt file
    :return: the split sentences are stored in the `sentences` set, so only a status message is returned
    """
    for paragraph in read_file.readlines():
        # paragraphs (or single words?) that are too short are not worth splitting
        if paragraph == '' or len(paragraph) <= 4:
            continue
        sentence_splitter = SentenceSplitter.split(paragraph)
        for sequence in sentence_splitter:
            # skip empty lines
            if sequence == '':
                continue
            # secondary split on commas
            second_sentences = re.split('[,,]', sequence)
            for second_sentence in second_sentences:
                # filter and clean the sentence
                second_sentence = deal_data(second_sentence)
                if second_sentence == '' or len(second_sentence) <= 4:
                    continue
                sentences.add(second_sentence)
    msg = "分句步骤已完成"
    print("=" * 10, msg, "=" * 10)
    return msg

def predict(self, question, text):
    input_question = self.sentence2input(question)
    sentences = list(SentenceSplitter.split(text))
    input_answers_temp = [self.sentence2input(sentence) for sentence in sentences]
    input_answers_len = np.array([len(a) for a in input_answers_temp])
    input_answers = tl.prepro.pad_sequences(input_answers_temp)
    input_questions = tl.prepro.pad_sequences([input_question] * len(sentences))
    input_question_len = np.array([len(q) for q in input_questions])
    simi_list = self.predict_sess.run(
        self.simi_P,
        feed_dict={
            self.Ques: input_questions,
            self.Ques_seq_len: input_question_len,
            self.Pos_Ans: input_answers,
            self.Pos_Ans_len: input_answers_len,
            self.keep_prob: 1.0
        })
    # for sen, simi in zip(sentences, simi_list):
    #     print(simi, sen)
    simi_list = np.array(simi_list)
    return sentences[np.argmax(simi_list)], max(simi_list)

def ltpSentenceSplit(content):
    """
    Split text into sentences with LTP.
    :param content: raw text
    :return: list of sentences
    """
    return list(SentenceSplitter.split(content))

def Relation_Extraction(content):
    statements = SentenceSplitter.split(str(content))
    relation = json.load(open(resource_path + 'all_relations.json'))
    all_triples = []
    for statement in statements:
        # print(statement)
        words, postags, arcs = ltp_parser.parse(statement)
        triples = rule_based_extraction(words, postags, arcs)
        tri = []
        for i in range(len(triples)):
            flag = 0
            for j in range(len(tri)):
                if triples[i][0] == tri[j][0] and triples[i][1] == tri[j][1] and triples[i][2] == tri[j][2]:
                    flag = 1
            if triples[i][0] == triples[i][2]:
                flag = 1
            if flag == 0:
                tri.append(triples[i])
        flag1 = 0
        if tri:
            # print('关系三元组:')
            for triple in tri:
                for i in relation:
                    if triple[1] == i:
                        flag1 = 1
                        break
                if flag1 == 1:
                    all_triples.append(triple)
    return all_triples

def feature_about():
    # get the feature list
    feature_dict = NewsUtil.get_feature()
    # collect the nearest 5 words (and their attributes) following each feature occurrence in the news
    logger.info("In Prepare Raw News...")
    raw_news_data = CommonUtil.read_excel(RAW_NEWS_DEMO_PATH)
    raw_news_table = raw_news_data.sheet_by_index(0)
    raw_news_rows = raw_news_table.nrows
    segmentor = Segmentor()  # initialize the instance
    segmentor.load_with_lexicon(cws_model_path, CFETSFX_LEXICON_PATH)  # load the model; the second argument is the path to your external lexicon
    feature_about_list = list()
    for rowN in range(0, raw_news_rows):
        news_content = raw_news_table.cell_value(rowN, 2)
        sentences = SentenceSplitter.split(news_content)
        for sentence in sentences:
            print(sentence)
            # word segmentation
            words = segmentor.segment(sentence)
            print(list(words))
            for word_index in range(0, len(words)):
                word = words[word_index]
                for feature_word in feature_dict.values():
                    if feature_word in word:
                        about_list = list()
                        count = 0
                        # collect the feature word plus the words that follow it (up to 6 tokens)
                        while word_index < len(words) and count < 6:
                            about_list.append(words[word_index])
                            count += 1
                            word_index += 1
                        feature_about_list.append(about_list)
                        print(about_list)
                        break
    segmentor.release()
    CommonUtil.write_csv(FEATURE_ABOUT_PATH, feature_about_list)

def main():
    file_list = []
    path = r'C:\Users\JeremySun\Desktop\Internship\Project02_corpusProcessor\english_text_pre'
    file_path = batch_file(path=path, file_list=file_list)
    for path in file_path:
        english_text_connect = open(path, encoding='utf-8').readlines()
        assetPath_loss_url = loss_url(text=english_text_connect)
        assetPath_loss_img = loss_img(text=assetPath_loss_url)
        assetPath_loss_video = loss_video(text=assetPath_loss_img)
        assetPath_loss_src = loss_src(text=assetPath_loss_video)
        assetPath_loss_div = loss_div(text=assetPath_loss_src)
        assetPath_loss_span = loss_span(text=assetPath_loss_div)
        assetPath_loss_p = loss_p(text=assetPath_loss_span)
        assetPath_loss_special = loss_special(text=assetPath_loss_p)
        assetPath_loss_continue = loss_continue(text=assetPath_loss_special)
        assetPath_loss_word = loss_word(text=assetPath_loss_continue)
        assetPath_loss_chino = loss_chino(text=assetPath_loss_word)
        assetPath_loss_greek = loss_greek(text=assetPath_loss_chino)
        assetPath_loss_pinyin = loss_pinyin(text=assetPath_loss_greek)
        assetPath_loss_fake = loss_fake(text=assetPath_loss_pinyin)
        assetPath_loss_tradition = loss_tradition(text=assetPath_loss_fake)
        assetPath_loss_comma = loss_comma(text=assetPath_loss_tradition)

        # split into sentences
        english_text_sentence = SentenceSplitter.split(assetPath_loss_comma)

        # strip the remaining punctuation and write to file
        pattern_all = re.compile(r"[。.;;??!!]")
        f = open("english_text_sentence_pre.txt", 'a', encoding='utf-8')
        for i in tqdm(english_text_sentence):
            i = re.sub(pattern=pattern_all, repl='', string=i)
            f.write(i + '\n')
        f.close()

def split2sent(self, text):
    '''
    Split the text into sentences.
    '''
    from pyltp import SentenceSplitter
    sents = SentenceSplitter.split(text)
    return sents

def article_preprocessing(article):
    """
    Preprocess a news article: split it into sentences, then segment and POS-tag the text.
    Segmentation here keeps stopwords and punctuation.
    Args:
        article: the original news Article
    Return:
        the preprocessed news PreprocessArticle
    """
    # print("article preprocessing")
    pre_article = PreprocessArticle()
    # pre_article.id = article.id
    pre_article.date = article.time
    pre_article.title = article.title
    pre_article.seg_pos_title = seg_pos.cut(article.title)
    sentences = SentenceSplitter.split(article.content)  # use LTP to split sentences
    sentences = [x for x in sentences if x != '']
    for idx, x in enumerate(sentences):
        sentence = Sentence()
        sentence.text = x
        sentence.location = idx
        sentence.seg_pos = seg_pos.cut(x)
        pre_article.sentences.append(sentence)
    return pre_article

def tokenized_sub_sents(row):
    content = row.iloc[1]
    sub_sents = []
    sub_tags = []
    sents = SentenceSplitter.split(content)
    for sent in sents:
        subs = [x for x in re.split(punc_pattern, sent) if x]
        subss = [jieba.posseg.cut(x, HMM=False) for x in subs if not re.findall(no_chn, x)]
        tags = []
        subs = []
        for s in subss:
            tag = []
            sub = []
            for t0, t1 in s:
                tag.append(t1)
                sub.append(t0)
            tags.append(tag)
            subs.append(sub)
        assert len(tags) == len(subs)
        sub_sents.extend(subs)
        sub_tags.extend(tags)
    # print(sub_sents, sub_tags)
    row["sub_sents_tokenized"] = sub_sents
    row["sub_sents_postagged"] = sub_tags
    return row

def mainLocaltion(self, dirName="西游记白话文"):
    txtlist = os.listdir(self.book_root_path + dirName)
    lo_list_book = []
    for txt in txtlist:
        lo_list_chapter = []
        print(txt)
        lines = self.readBookLines(self.book_root_path + dirName + "/" + txt)
        for line in lines:
            if line != "":
                sents = SentenceSplitter.split(line)
                for sent in sents:
                    words_line = self.segmentor(sent)
                    lo_list_line = self.posttagerNLNS(words_line)
                    lo_list_chapter += lo_list_line
        # top-10 statistics for each chapter
        top_itf_chapter, top_lo_chapter = self.getTopTen(lo_list_chapter)
        lo_list_book += top_lo_chapter
        self.writeTxt(self.mainlo_root_path + dirName + "/" + txt, top_itf_chapter)
        print(txt + "本章节top 10----------------------")
        for cloname, clotimes, clofreq in top_itf_chapter:
            print(cloname, clotimes, clofreq)
    # top-10 statistics for the whole book
    top_loitf_book, top_lo_book = self.getTopTen(lo_list_book)
    self.writeTxt(self.mainlo_root_path + dirName + "/AllChapter.txt", top_loitf_book)
    print("整本书 top 10----------------------")
    for bloname, blotimes, blofreq in top_loitf_book:
        print(bloname, blotimes, blofreq)

def mainName(self, dirName):
    txtlist = os.listdir(self.book_root_path + dirName)
    name_list_book = []
    for txt in txtlist:
        name_list_chapter = []
        print(txt)
        lines = self.readBookLines(self.book_root_path + dirName + "/" + txt)
        for line in lines:
            if line != "":
                sents = SentenceSplitter.split(line)
                for sent in sents:
                    words_line = self.segmentor(sent)
                    postags_line, name_list_line = self.posttaggerNH(words_line)
                    name_list_chapter += name_list_line
        # top-10 statistics for each chapter
        top_itf_chapter, top_name_chapter = self.getTopTen(name_list_chapter)  # [(name, times, freq), ...]
        name_list_book += top_name_chapter
        self.writeTxt(self.mainrole_root_path + dirName + "/" + txt, top_itf_chapter)
        print(txt + "本章节top 10----------------------")
        for cname, ctimes, cfreq in top_itf_chapter:
            print(cname, ctimes, cfreq)
    # top-10 statistics for the whole book
    top_itf_book, top_name_book = self.getTopTen(name_list_book)
    self.writeTxt(self.mainrole_root_path + dirName + "/AllChapter.txt", top_itf_book)
    print("整本书 top 10----------------------")
    for bname, btimes, bfreq in top_itf_book:
        print(bname, btimes, bfreq)

def sentence_split(paras_aft_retri, topic_to_search):
    assert len(paras_aft_retri) == len(topic_to_search)
    res = []
    for i, item in enumerate(paras_aft_retri):
        para_to_split, urls, tops, query = item
        tops = topic_to_search[i][1]
        sent_aft_split = []
        url_aft = []
        for ind, para in enumerate(para_to_split):
            sents = SentenceSplitter.split(para)
            url = urls[ind]
            for sent in sents:
                # drop questions, boilerplate (citations, disclaimers, sources, contact info),
                # very short sentences, and sentences with too few Chinese characters
                if sent[-1] == '?' or sent[-1] == '?':
                    continue
                elif "引用" in sent or "相关阅读" in sent or "特别声明" in sent or "声明" in sent or "来源" in sent \
                        or "原标题" in sent or "联系电话" in sent or "联系方式" in sent:
                    continue
                elif len(sent) < 15:
                    continue
                elif check_contain_chinese(sent) < 0.6:
                    continue
                else:
                    sent_aft_split.append(sent)
                    url_aft.append(url)
        assert len(sent_aft_split) == len(url_aft)
        res.append((sent_aft_split, url_aft, tops, query))
    # print(res)
    return res

def main():
    file_list = []
    path = r'D:\实习数据备份\备份\professional'
    file_path = batch_file(path=path, file_list=file_list)
    for path in file_path:
        profession_text_connect = open(path, encoding='utf-8').readlines()
        assetPath_loss_html = loss_html(text=profession_text_connect)
        assetPath_loss_label = loss_label(text=assetPath_loss_html)
        assetPath_loss_mail = loss_mail(text=assetPath_loss_label)
        assetPath_loss_other = loss_other(text=assetPath_loss_mail)
        assetPath_loss_url = loss_url(text=assetPath_loss_other)
        assetPath_clean_url = clean_url(text=assetPath_loss_url)
        assetPath_loss_continue = loss_continue(text=assetPath_clean_url)
        assetPath_loss_word = loss_word(text=assetPath_loss_continue)
        assetPath_loss_comma = loss_comma(text=assetPath_loss_word)

        # split into sentences
        assetPath_sentence = SentenceSplitter.split(assetPath_loss_comma)

        # strip the remaining punctuation and write to file
        pattern_all = re.compile(r"[。.;;::??!!/|]")  # also strips half- and full-width colons
        f = open("profession_pre.txt", 'a', encoding='utf-8')
        for i in tqdm(assetPath_sentence):
            i = re.sub(pattern=pattern_all, repl='', string=i)
            f.write(i + '\n')
        f.close()

def main():
    file_list = []
    path = r'C:\Users\JeremySun\Desktop\Internship\Project02_corpusProcessor\english_folder'
    file_path = batch_file(path=path, file_list=file_list)
    for path in file_path:
        english_text_connect = open(path, encoding='utf-8').readlines()
        assetPath_loss_html = loss_html(text=english_text_connect)
        assetPath_loss_label = loss_label(text=assetPath_loss_html)
        assetPath_loss_mail = loss_mail(text=assetPath_loss_label)
        assetPath_loss_other = loss_other(text=assetPath_loss_mail)
        assetPath_loss_url = loss_url(text=assetPath_loss_other)
        assetPath_clean_url = clean_url(text=assetPath_loss_url)
        assetPath_loss_continue = loss_continue(text=assetPath_clean_url)
        assetPath_loss_word = loss_word(text=assetPath_loss_continue)
        assetPath_loss_comma = loss_comma(text=assetPath_loss_word)

        # split into sentences
        english_text_sentence = SentenceSplitter.split(assetPath_loss_comma)

        # strip the remaining punctuation and write to file
        pattern_all = re.compile(r"[。.;;??!!::]")  # also strips half- and full-width colons
        pattern_last = re.compile(r'[a-zA-Z0-9]{13,}')
        f = open("english_text_sent_pre.txt", 'a', encoding='utf-8')
        for i in tqdm(english_text_sentence):
            if len(i) <= 100:
                i = re.sub(pattern=pattern_all, repl=' ', string=i)
                i = re.sub(pattern=pattern_last, repl='', string=i)
                f.write(i.strip() + '\n')
        f.close()

def get_roles_emotions(self):
    '''
    Get the characters in self.book and their emotion-change scores.
    :return:
    '''
    contents = self.get_chapterList()
    roles = self.get_roles()
    role_points = [[] for i in range(len(roles))]
    for content in contents:
        sents = SentenceSplitter.split(content)
        sent_words = []
        for sent in sents:
            words = list(self.segmentor.segment(sent))
            for role in roles:
                if role in words:
                    sent_words.append(words)
                    break
        role_emotionWords = self.get_role_emotionWord(roles, sent_words)
        points = self.count_point(role_emotionWords, roles)
        for i in range(len(role_points)):
            role_points[i].append(points[i])
    # interpolate zero scores from the neighbouring chapters
    for term in range(len(role_points)):
        index = len(role_points[term]) - 2
        while index >= 1:
            if role_points[term][index] == 0 or role_points[term][index] == 0.0:
                role_points[term][index] = (role_points[term][index - 1] + role_points[term][index + 1]) / 2
            index -= 1
    for i in range(len(role_points)):
        print(roles[i], role_points[i])
    return roles, role_points

def getDAN(self, path="测试测试啊哈哈哈"):
    '''
    Read the text files and collect dan_list.
    :return:
    '''
    # txts = []
    dan_list = []
    files = os.listdir(self.book_root_path)
    for file in files:
        fileposition = self.book_root_path + "\\" + file
        print("file name:", fileposition)
        with open(fileposition, "r", encoding="utf-8") as f:
            lines = f.readlines()
            for line in lines:
                # txts.append(line)
                if line != "":
                    sentences = SentenceSplitter.split(line)
                    # print("sentences:", sentences)
                    for sentence in sentences:
                        words = self.segmentor(sentence)
                        # print("sentences 11:", words)
                        dan_list_line = self.postagger(words)
                        dan_list += dan_list_line
        f.close()
    return list(set(dan_list))

# def getDAN(self):
#     '''

def main():
    file_list = []
    path = r'D:\实习数据备份\备份\xinwenshuju'
    file_path = batch_file(path=path, file_list=file_list)
    for path in file_path:
        try:
            news_text_connect = open(path, encoding='utf-8').readlines()
            assetPath_loss_html = loss_html(text=news_text_connect)
            assetPath_loss_label = loss_label(text=assetPath_loss_html)
            assetPath_loss_mail = loss_mail(text=assetPath_loss_label)
            assetPath_loss_other = loss_other(text=assetPath_loss_mail)
            assetPath_loss_url = loss_url(text=assetPath_loss_other)
            assetPath_clean_url = clean_url(text=assetPath_loss_url)
            assetPath_loss_continue = loss_continue(text=assetPath_clean_url)
            assetPath_loss_word = loss_word(text=assetPath_loss_continue)
            assetPath_loss_comma = loss_comma(text=assetPath_loss_word)

            # split into sentences
            assetPath_sentence = SentenceSplitter.split(assetPath_loss_comma)

            # strip the remaining punctuation and write to file
            pattern_all = re.compile(r"[。.;;::??!!|]")  # also strips half- and full-width colons
            f = open("news_pre.txt", 'a', encoding='utf-8')
            for i in tqdm(assetPath_sentence):
                i = re.sub(pattern=pattern_all, repl='', string=i)
                f.write(i + '\n')
            f.close()
        except:
            print("utf-8 codec can not decode byte 0xc3 in position 0")

def get_sentences(content_sentences):
    sentences_list = list(SentenceSplitter.split(content_sentences))
    # drop empty entries and strip whitespace
    sentences_list = [item.strip() for item in sentences_list if item]
    return sentences_list

def sentence_split(self, string):
    """
    Split into sentences.
    :param string: input string
    :return: iterable of sentences
    """
    sentence = SentenceSplitter.split(string)
    return sentence

def split_sentence(sentence):
    # pattern = re.compile('[。;]')
    # split = pattern.sub('\r', sentence).split()  # split sentence
    # return split
    sents = SentenceSplitter.split(sentence)
    result = [x for x in sents if x != '']
    return result

def sentence_splitter(sentence):
    sents = SentenceSplitter.split(sentence)  # split into sentences
    sent = ''
    for i in sents:
        i = re.sub("[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+", "", str(i))
        sent += i + ' '
    # print(sent)
    return sent

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, os

ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
sys.path = [os.path.join(ROOTDIR, "lib")] + sys.path

# Set your own model path
MODELDIR = os.path.join(ROOTDIR, "ltp_data")

from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

paragraph = '中国进出口银行与中国银行加强合作。中国进出口银行与中国银行加强合作!'

sentence = SentenceSplitter.split(paragraph)[0]

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
words = segmentor.segment(sentence)
print("\t".join(words))

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# a list-of-string parameter is supported since 0.1.5
# postags = postagger.postag(["中国", "进出口", "银行", "与", "中国银行", "加强", "合作"])
print("\t".join(postags))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)
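
# The example above stops after dependency parsing. A minimal, hedged continuation
# (not part of the original snippet): print the arcs and release the loaded models.
# arc.head / arc.relation are the standard pyltp Parser outputs.
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

segmentor.release()
postagger.release()
parser.release()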