def text_keyword_abstract(article, keywords_len, sentences_len):
    # Extract keywords
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=article, lower=True, window=2)
    keywords = []
    for item in tr4w.get_keywords(keywords_len, word_min_len=2):
        keywords.append(item.word)
    keywords = ' '.join(keywords)
    # Extract the abstract
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=article, lower=True, source='all_filters')
    abstract = []
    for item in tr4s.get_key_sentences(num=sentences_len):
        abstract.append(item.sentence)
    return keywords, abstract

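# A minimal usage sketch for text_keyword_abstract above; assumes
# `from textrank4zh import TextRank4Keyword, TextRank4Sentence` has been done,
# and the sample article below is made up.
def demo_text_keyword_abstract():
    article = '自然语言处理是计算机科学的一个重要方向。自然语言处理研究人与计算机之间用自然语言进行有效通信的理论和方法。'
    keywords, abstract = text_keyword_abstract(article, keywords_len=5, sentences_len=2)
    print(keywords)   # space-joined keyword string
    print(abstract)   # list of key sentences
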
def keyword(self):
    keyword1 = jieba.analyse.textrank(self.cut, topK=5, withWeight=True)
    keyword2 = jieba.analyse.extract_tags(self.cut, topK=5, withWeight=True)
    print('Key word for jieba_textrank:')
    for m in keyword1:
        print(m[0], m[1])
    print('\nKey word for jieba_TF-IDF:')
    for n in keyword2:
        print(n[0], n[1])
    print('\nKey word for textrank4zh:')
    keyword3 = TextRank4Keyword()
    keyword3.analyze(text=self.cut, lower=True, window=2)
    for i in keyword3.get_keywords(5):
        print(i.word, i.weight)

def key_word_extraction(articles):
    key_word = []
    num_key_word = 5
    tr4w = TextRank4Keyword()
    for i in range(len(articles)):
        temp_key_word = []
        tr4w.analyze(text=articles[i], lower=True, window=3,
                     vertex_source='all_filters', edge_source='all_filters')
        # Compute the keyword list once per article rather than once per keyword
        keywords = tr4w.get_keywords(num=num_key_word)
        for j in range(num_key_word):
            temp_key_word.append(keywords[j]['word'])
        key_word.append(temp_key_word)
    return key_word

def getKeyWordsByTitle(max_num=10, startColum=None, endColum=None):
    # Maximum number of keywords per day
    # max_num = 7
    # Make pandas display all rows and columns
    pd.set_option('display.max_rows', None)     # rows
    pd.set_option('display.max_columns', None)  # columns
    # Load the required keywords
    keyword = pd.read_csv('../data/keyword_title.csv')['word'].values.tolist()
    # Load the titles
    data_arr = pd.read_csv('../data/RenMinTitle_all_2020_04_06.csv').iloc[
        startColum:endColum, :30].values
    # Merge each day into a single record, flattening to 1-D data
    data_list = []
    for arr in data_arr:
        data_list.append(' '.join(arr))
    print(data_list)
    # Build the date list
    # date_list = netUtil.getDateList(len(data_list))
    # 2-D matrix storing keywords per day
    keywords_allday_list = []
    # Set of all keywords, used later to build the connection matrix
    keywords_all_set = set()
    # Extract each day's keywords
    for day_title in data_list:
        day_keywords_list = []
        words = TextRank4Keyword()
        words.analyze(text=day_title, lower=True, window=3)
        keywords_dic_list = words.get_keywords(40, word_min_len=2)
        for one_dic in keywords_dic_list:
            # The commented-out block below filters for nouns only
            # s_tags = SnowNLP(one_dic['word']).tags
            # print('---')
            # flag = 0
            # for tag in s_tags:
            #     if tag[1] == 'n':
            #         flag = 1
            #         print(tag)
            # if one_dic['word'] in keyword:
            #     if flag == 1:
            if one_dic['word'] in keyword:
                day_keywords_list.append(one_dic['word'])
                keywords_all_set.add(one_dic['word'])
                if len(day_keywords_list) >= max_num:
                    break
        keywords_allday_list.append(day_keywords_list)
    return keywords_allday_list

def process(self):
    for i in self.text:
        self.article += i.getText() + '\n'
    self.article = self.article.strip()
    keywords = []
    abstract = []
    # Keywords
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=self.article, lower=True, window=2)
    for item in tr4w.get_keywords(4, word_min_len=1):
        keywords.append(item.word)
    # Abstract
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=self.article, lower=True, source='all_filters')
    for item in tr4s.get_key_sentences(num=3):
        abstract.append(item.sentence)
    return keywords, abstract

def compute_texttank(df, filepath):
    writer = pd.ExcelWriter(filepath)
    for names in df:
        all_re = []
        test_df = df[names]
        # Originally the full test_df; truncated to test_df[:5] for the demo
        text_all = concat_all(df=test_df[:5])
        tr4w = TextRank4Keyword()
        tr4w.analyze(text=text_all, lower=True, window=6)
        for item in tr4w.get_keywords(num=100, word_min_len=2):
            all_re.append([item.word, item.weight])
        df_result = pd.DataFrame(all_re)
        df_result.columns = ['關鍵詞', 'Textrank分數']  # 'keyword', 'TextRank score'
        df_result.to_excel(writer, sheet_name=names)
    writer.save()

def get_textRank_kp(file_name, topk):
    json_file = open(file_name, 'r', encoding='utf-8')
    textRank_kp = []
    for line in json_file.readlines():
        json_data = json.loads(line)
        cur_content = json_data['title'].strip().lower() + ' ' + \
            json_data['abstract'].strip().lower()
        tr4w = TextRank4Keyword()
        tr4w.analyze(text=cur_content, lower=True, window=2)
        keywords_list = []
        for item in tr4w.get_keywords(topk, word_min_len=1):
            keywords_list.append(item.word)
        kp_list = get_kp(cur_content, keywords_list)
        # textRank_kp = tr4w.get_keyphrases(keywords_num=20, min_occur_num=2)
        textRank_kp.append(kp_list)
    json_file.close()
    return textRank_kp

def getKeyword(text, keywords_num=20):
    """Extract keywords from text; the number of keywords is keywords_num.
    Key phrases are extracted as well. A key phrase must occur at least
    phrase_min_num=5 times in the text; if none is found, the threshold is
    lowered step by step.
    Returns the keyword list (item.word, item.weight) and the key phrase list.
    """
    tr4w = TextRank4Keyword()
    # Analyze the text with a window size of 2, lowercasing English words
    tr4w.analyze(text=text, lower=True, window=2)
    kw = tr4w.get_keywords(num=keywords_num, word_min_len=1)
    mon = 5
    while mon > 0:
        kp = tr4w.get_keyphrases(keywords_num=keywords_num, min_occur_num=mon)
        if len(kp) > 0:
            print(mon)
            break
        mon -= 1
    return kw, kp

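# A hedged sketch showing getKeyword's fallback behavior: for short texts the
# min_occur_num threshold usually has to drop before any key phrase appears.
# The sample text is hypothetical.
def demo_getKeyword():
    sample = 'TextRank 算法基于图排序。TextRank 算法用 PageRank 的思想计算词语的重要性。'
    kw, kp = getKeyword(sample, keywords_num=5)
    for item in kw:
        print(item.word, item.weight)  # keywords with their TextRank weights
    print(kp)                          # key phrases; may be empty for very short texts
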
def get_words_list(path='../data/countryComment.csv'):
    tr4w = TextRank4Keyword()
    # Load the comments ('评论内容' is the comment-text column of the CSV)
    comments = pd.read_csv(path)['评论内容'].values
    # Segment each comment into words
    word_list = []
    for comment in comments:
        tr4w.analyze(text=comment)
        words = tr4w.words_all_filters
        words = sum(words, [])  # flatten the 2-D list to 1-D
        word_list.append(words)
    print(word_list)
    words = sum(word_list, [])
    words = pd.DataFrame(words)
    words.to_csv('../data/words.csv', index=False)
    print('words.csv文件保存成功!')
    return words

def generator(self, text):
    # Load the stop-word list
    tr4w = TextRank4Keyword(stop_words_file='/home/ubuntu/HITChat/stopword.txt')
    # Small text, so window=3; note that speech_tag_filter=False actually
    # disables POS filtering. train() is the older textrank4zh API
    # (newer releases use analyze()).
    tr4w.train(text=text, speech_tag_filter=False, lower=True, window=3)
    key = []
    origin = []
    with open("/home/ubuntu/HITChat/userarticle.txt", "r") as f:
        for line in f.readlines():
            origin.append(line.strip())
    for word in tr4w.get_keywords(10, word_min_len=2):
        key.append(word)
    if len(origin) <= 200:
        with open("/home/ubuntu/HITChat/userarticle.txt", "a") as f:
            for i in range(len(key)):
                if key[i] not in origin:
                    f.write(key[i] + "\n")
    return key

def get_chinese_keywords(text):
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=text, lower=True, window=5)
    keywords_ = []
    for item in tr4w.get_keywords(10, word_min_len=2):
        # print(item.word, item.weight)
        keywords_.append(item.word)
    # print()
    # print('关键短语:')
    keyphrases = []
    for phrase in tr4w.get_keyphrases(keywords_num=20, min_occur_num=2):
        keyphrases.append(phrase)
    return keywords_, keyphrases

def fit_transform(self, theta=0.5):
    datMat = self.loadData(self.data)
    word_segmentation = []
    for i in range(len(datMat)):
        word_segmentation.append(self.word_segment(datMat[i]))
    print(
        "............................................................................................"
    )
    print('文本已经分词完毕 !')
    # Obtain the vector-space representation of the text data
    corpus_tfidf = self.get_Tfidf_vector_representation(word_segmentation)
    # corpus_tfidf = self.get_Doc2vec_vector_representation(word_segmentation)
    dictTopic, clusterTopic = self.single_pass(corpus_tfidf, datMat, theta)
    print(
        "............................................................................................"
    )
    print("得到的主题数量有: {} 个 ...".format(len(dictTopic)))
    print(
        "............................................................................................\n"
    )
    # Sort topics by the number of clustered sentences to find the major clusters
    clusterTopic_list = sorted(clusterTopic.items(),
                               key=lambda x: len(x[1]),
                               reverse=True)
    for k in clusterTopic_list[:30]:
        cluster_title = '\n'.join(k[1])
        # print(''.join(cluster_title))
        # Extract the topic keywords of each cluster
        word = TextRank4Keyword()
        word.analyze(''.join(self.word_segment(''.join(cluster_title))),
                     window=5, lower=True)
        w_list = word.get_keywords(num=10, word_min_len=2)
        sentence = TextRank4Sentence()
        sentence.analyze('\n'.join(k[1]), lower=True)
        s_list = sentence.get_key_sentences(num=3, sentence_min_len=5)[:30]
        print("【主题索引】:{} \n【主题声量】:{} \n【主题关键词】: {} \n【主题中心句】 :\n{}".format(
            k[0], len(k[1]), ','.join([i.word for i in w_list]),
            '\n'.join([i.sentence for i in s_list])))
        print('\n')
        print("【标题】:", '\n'.join([content[:20] for content in k[1]]))
        print(
            "-------------------------------------------------------------------------"
        )

def text_keyword_abstract(article, keywords_len, sentences_len):
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=article, lower=True, window=2)
    keywords = []
    for item in tr4w.get_keywords(keywords_len, word_min_len=2):
        keywords.append(item.word)
    keywords = ' '.join(keywords)
    # Skip the lead sentence so the abstract adds information beyond the opening
    sentences = article.split('.')
    first_sentence = sentences[0]
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=article, lower=True, source='all_filters')
    abstract = []
    for item in tr4s.get_key_sentences(num=sentences_len):
        if item.sentence != first_sentence:
            abstract.append(item.sentence + '.')
    abstract = '\n'.join(abstract)
    return keywords  # , abstract

def keywords_extraction(text, num, word_min_len):
    tr4w = TextRank4Keyword(allow_speech_tags=['n', 'nr', 'nrfg', 'ns', 'nt', 'nz'])
    # allow_speech_tags -- list of POS tags; words with other tags are filtered out
    tr4w.analyze(text=text, window=2, lower=True, vertex_source='all_filters',
                 edge_source='no_stop_words', pagerank_config={'alpha': 0.85, })
    # text -- the text content, a string
    # window -- window size (int) used to build edges between words; default 2
    # lower -- whether to lowercase English text; default False
    # vertex_source -- which of words_no_filter, words_no_stop_words,
    #                  words_all_filters supplies the nodes of the PageRank graph;
    #                  default 'all_filters', options: 'no_filter',
    #                  'no_stop_words', 'all_filters'
    # edge_source -- which of words_no_filter, words_no_stop_words,
    #                words_all_filters supplies the edges between nodes of the
    #                PageRank graph; default 'no_stop_words', options:
    #                'no_filter', 'no_stop_words', 'all_filters'.
    #                Edge construction also depends on the `window` parameter.
    # pagerank_config -- PageRank parameters; damping factor alpha = 0.85
    keywords = tr4w.get_keywords(num=num, word_min_len=word_min_len)
    # num -- number of keywords to return
    # word_min_len -- minimum word length; default 1
    return keywords

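# A minimal call sketch for keywords_extraction; the sample text is made up.
# In textrank4zh each returned item supports both attribute access (item.word)
# and dict access (item['word']).
def demo_keywords_extraction():
    text = '人工智能正在改变世界,机器学习是人工智能的核心技术之一。'
    for item in keywords_extraction(text, num=5, word_min_len=2):
        print(item.word, item.weight)
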
def nlp(contents):
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=''.join(i for i in contents), lower=True, window=2)
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=''.join(i for i in contents), lower=True,
                 source='all_filters')
    keyword = list(tr4w.get_keywords(20, word_min_len=1))
    keyphase = list(tr4w.get_keyphrases(keywords_num=20, min_occur_num=2))
    keysentence = list(tr4s.get_key_sentences(num=3))
    return keyword, keyphase, keysentence

def sina_keyword_abstract(article, keywords_len, sentences_len):
    # Extract keywords
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=article, lower=True, window=2)
    keywords = []
    for item in tr4w.get_keywords(keywords_len, word_min_len=1):
        keywords.append(item.word)
    # Extract the abstract, skipping Sina boilerplate sentences
    # ('原标题' = original title, '责任编辑' = editor, '来源' = source)
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=article, lower=True, source='all_filters')
    abstract = []
    for item in tr4s.get_key_sentences(num=sentences_len):
        if str(item.sentence).startswith('原标题') or \
                str(item.sentence).startswith('责任编辑') or \
                str(item.sentence).startswith('来源'):
            continue
        abstract.append(item.sentence + '。')
    abstract = '\n'.join(abstract)
    return keywords, abstract

def contain_word(query, key_query):
    import_html_dir = Path.cwd().parent / 'html' / query
    index_csv_dir = Path.cwd().parent / 'csv' / f'{query}.csv'
    export_dir = Path.cwd().parent / 'category'
    if not export_dir.exists():
        export_dir.mkdir()
    export_html_dir = Path.cwd().parent / 'category' / query
    if not export_html_dir.exists():
        export_html_dir.mkdir()
    index = pd.read_csv(index_csv_dir)
    # Create each category CSV with a header row, in export_html_dir -- the
    # same directory the matching rows are appended to below
    for key, value in key_query.items():
        with open(os.path.join(export_html_dir, f'{key}.csv'), 'w',
                  encoding='utf-8-sig') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["title", "url"])
    for i in range(len(index)):
        url = index.iloc[i, 2]
        title = index.iloc[i, 1]
        html_file = codecs.open(os.path.join(import_html_dir, f'{title}.html'),
                                'r', 'utf-8').read()
        end = 'media_tool_meta tips_global_primary meta_primary'
        html_file = html_file[:html_file.rfind(end)]
        # Convert the HTML to plain text
        text = BeautifulSoup(html_file, features="lxml").get_text()
        tr4w = TextRank4Keyword()
        tr4w.analyze(text=text, lower=True, window=2)
        list_key = []
        for key_word in tr4w.get_keywords(10, word_min_len=2):
            list_key.append(key_word.word)
        textrank = analyse.textrank
        keywords = textrank(text, 10)
        for keyword in keywords:
            list_key.append(keyword)
        list_key = set(list_key)
        print(title, list_key)
        for key, value in key_query.items():
            if not list_key.isdisjoint(value):
                print(title, key)
                with open(os.path.join(export_html_dir, f'{key}.csv'), 'a',
                          encoding='utf-8-sig') as csvfile:
                    writer = csv.writer(csvfile)
                    writer.writerows([(title, url)])

def Keyword():
    text = codecs.open('/Users/liamtheron/Desktop/Deloiite/test.txt', 'r',
                       encoding='utf-8').read()
    tr4w = TextRank4Keyword()
    tr4s = TextRank4Sentence()
    tr4w.analyze(text=text, lower=True, window=2)
    tr4s.analyze(text=text, lower=True)
    print('<关键词>:')
    for item in tr4w.get_keywords(20, word_min_len=1):
        print(item.word, item.weight)
    print()
    print('<关键短语>:')
    for phrase in tr4w.get_keyphrases(keywords_num=20, min_occur_num=2):
        print(phrase)
    print()
    print('<摘要>:')
    for item in tr4s.get_key_sentences(num=3):
        print(item.index, item.weight, item.sentence)

def zhihu_exec():
    Qlink = request.form.get('Qlink', '')
    Client = zhihu_oauth.ZhihuClient()
    Client.load_token('token.pkl')
    me = Client.me()
    question = Client.question(int(Qlink))
    with open(os.path.join(os.path.join('projhorus', 'static'),
                           'question_%s_result.txt' % Qlink), 'w') as f:
        for i in question.answers:
            tr4w = TextRank4Keyword()
            tr4w.analyze(text=i.content, lower=True, window=2)
            f.write(u'<--------------------关键词-------------------->\n')
            for item in tr4w.get_keywords(20, word_min_len=1):
                f.write(str(item.word) + ' ' + str(item.weight) + '\n')
            f.write(u'<--------------------关键短语-------------------->\n')
            for phrase in tr4w.get_keyphrases(keywords_num=20, min_occur_num=2):
                f.write(phrase + '\n')
    global FLAG
    FLAG = '/static/question_%s_result.txt' % Qlink
    return redirect('/')

def textrank_keyWords(cat):
    df_u8 = pd.read_csv('contents/' + cat + '_u8.csv', encoding="utf_8_sig")
    df_gb = pd.read_csv('contents/' + cat + '_gb.csv', encoding="GB2312")
    content = []
    for i in range(len(df_gb['content'])):
        tempStr = str(df_gb['title'][i]).strip() + ' ' + str(
            df_gb['content'][i]).strip()
        content.append(tempStr)
    for i in range(len(df_u8['content'])):
        content.append(
            str(df_u8['title'][i]).strip() + ' ' +
            str(df_u8['content'][i]).strip())
    words = {}
    tr4w = TextRank4Keyword(stop_words_file='stopWords.txt')
    # Sample 500 articles at random (with replacement)
    randomList = []
    for i in range(500):
        randomList.append(random.randint(0, len(content) - 1))
    # Accumulate each keyword's weight across the sampled articles
    for i in randomList:
        tr4w.analyze(text=content[i], lower=True, window=2)
        for item in tr4w.get_keywords(5, word_min_len=1):
            if item.word not in stopList and item.word not in specStop:
                if item.word not in words.keys():
                    words[item.word] = item.weight
                else:
                    words[item.word] = words[item.word] + item.weight
    sorted_by_value = sorted(words.items(), key=lambda kv: kv[1], reverse=True)
    # print(cat + ':')
    tags = []
    frequency = []
    for l in sorted_by_value[0:20]:
        f = l[1]
        w = l[0]
        frequency.append(float(f) * 1000)
        tags.append(w)
    total_text = {}
    for i in range(len(tags)):
        total_text[tags[i]] = int(frequency[i])
    wordcloud = WordCloud(font_path='C:/WINDOWS/Fonts/STKAITI.TTF',
                          background_color='white').fit_words(total_text)
    wordcloud.to_file('wordcloud/' + cat + '_tr.jpg')

def action_two():
    import pandas as pd
    from textrank4zh import TextRank4Keyword, TextRank4Sentence

    news = pd.read_table('textrank/news.txt', encoding='GB18030', header=None)
    strings = ''
    for index in range(news.shape[0]):
        strings += news.loc[index, 0]
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=strings, lower=True, window=3)
    print('关键词:')
    for item in tr4w.get_keywords(20, word_min_len=2):
        print(item.word, item.weight)
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=strings, lower=True, source='all_filters')
    print('摘要:')
    # The three highest-weighted sentences
    for item in tr4s.get_key_sentences(num=3):
        print(item.weight, item.sentence)

def get_summary(self, data, flag=0):
    text = "".join(data)
    if flag == 0:
        tr4w = TextRank4Keyword()
        tr4w.analyze(text=text, lower=True, window=2)
        # ret = tr4w.get_keywords()
        ret = tr4w.get_keyphrases(keywords_num=12, min_occur_num=0)
        if len(ret) > 0:
            return ret[0]
        else:
            return ""
    else:
        tr4s = TextRank4Sentence()
        tr4s.analyze(text=text, lower=True)
        ret = tr4s.get_key_sentences(num=6, sentence_min_len=4)
        # `> 0` (not `>= 0`) so an empty result cannot raise an IndexError
        if len(ret) > 0:
            return ret[0]['sentence']
        else:
            return ""

def textSummary(fileName, finalName):
    text = codecs.open(fileName, 'r', 'utf-8').read()
    tr4w = TextRank4Keyword()
    # In py2, `text` must be a utf-8 encoded str or a unicode object;
    # in py3, a utf-8 encoded bytes or a str object
    tr4w.analyze(text=text, lower=True, window=2)
    # Key sentences
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    # Group the key sentences by length
    sentences = {}
    for item in tr4s.get_key_sentences(num=50):
        sentences.setdefault(len(item.sentence), []).append(item.sentence)
    # Take one sentence from each of the three longest groups
    key_sentences = []
    for i in range(3):
        key = max(sentences.keys())
        # print(sentences[key])
        key_sentences.append(sentences[key][0])
        sentences.pop(key)
    # Split the original text on CJK punctuation
    text = re.split(
        pattern=
        r'[\u3000-\u301e\ufe10-\ufe19\ufe30-\ufe44\ufe50-\ufe6b\uff01-\uffee]',
        string=text)
    # Emit the key sentences in their original document order
    ks = []
    for te in text:
        for k in key_sentences:
            if str(te) in str(k) and str(k) not in ks:
                ks.append(k)
    line = ''
    for k in ks:
        line += k + " "
    fw = codecs.open(finalName + "output.txt", 'a', 'utf-8')
    fw.write(line)
    fw.close()

def get_textrank4zh_keywords_phrase(contents):
    """
    Extract key phrases from the text. This feature is somewhat rough and
    not always useful.
    :param contents: string
    :return: list of key phrase strings
    """
    # Return the top 20 key phrases
    topK = 20
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=contents, lower=True)
    # logger.info('Extracting key phrases with textrank4zh, 20 by default')
    # print('Key phrases:')
    # for phrase in tr4w.get_keyphrases(keywords_num=20, min_occur_num=2):
    #     print(phrase)
    result = tr4w.get_keyphrases(keywords_num=topK, min_occur_num=2)
    return result

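# A sketch of calling get_textrank4zh_keywords_phrase; since a phrase must
# occur at least min_occur_num=2 times, short single-occurrence texts commonly
# return an empty list. The sample text is made up.
def demo_get_keyphrases():
    contents = '深度学习模型需要大量数据,深度学习模型的训练成本也很高。'
    print(get_textrank4zh_keywords_phrase(contents))
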
def generate_keywords(text, number):
    """
    textrank4zh: TextRank implementation for Chinese text, for keyword and
        summary extraction. https://github.com/letiantian/TextRank4ZH
    pyhanlp: Python interface to the HanLP NLP toolkit; provides TextRank
        keyword extraction. https://github.com/hankcs/pyhanlp
    jieba: the jieba segmentation toolkit; provides TF-IDF and TextRank
        keyword extraction. https://github.com/fxsjy/jieba
    :param text: the text
    :param number: number of keywords
    :return: dict of keyword lists, one entry per extractor
    """
    keywords = {}
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=text, lower=True, window=2)
    for item in tr4w.get_keywords(number, word_min_len=1):
        keywords.setdefault('textrankzh', []).append(item.word)
    # Build a two-sentence abstract, then run the extractors on it as well
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    abstract = ""
    TextRankKeyword = JClass('com.hankcs.hanlp.summary.TextRankKeyword')
    for item in tr4s.get_key_sentences(num=2):
        abstract += item.sentence
    tr4w.analyze(text=abstract, lower=True, window=2)
    for item in tr4w.get_keywords(number, word_min_len=1):
        keywords.setdefault('textrank_abs', []).append(item.word)
    for item in HanLP.extractKeyword(text, number):
        keywords.setdefault('hanlp', []).append(item)
    for item in HanLP.extractKeyword(abstract, number):
        keywords.setdefault('hanlp_abs', []).append(item)
    for item in jieba.analyse.extract_tags(text, topK=number,
                                           withWeight=False, allowPOS=()):
        keywords.setdefault('tfidf', []).append(item)
    for item in jieba.analyse.textrank(text, topK=number, withWeight=False,
                                       allowPOS=('ns', 'n', 'vn', 'v')):
        keywords.setdefault('jiebarank', []).append(item)
    for item in jieba.analyse.extract_tags(abstract, topK=number,
                                           withWeight=False, allowPOS=()):
        keywords.setdefault('tfidf_abs', []).append(item)
    for item in jieba.analyse.textrank(abstract, topK=number, withWeight=False,
                                       allowPOS=('ns', 'n', 'vn', 'v')):
        keywords.setdefault('jiebarank_abs', []).append(item)
    return keywords

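# A hedged side-by-side sketch for generate_keywords; it assumes textrank4zh,
# pyhanlp (with a working JVM), and jieba are all installed, and the text is
# a made-up example.
def demo_generate_keywords():
    text = '新能源汽车销量持续增长,相关企业加快布局新能源汽车产业链。'
    for method, words in generate_keywords(text, number=3).items():
        print(method, words)
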
def extract_key_information(self, num_key_word=30, num_key_phrase=20,
                            num_key_sentence=5):
    text = ''.join(self.article_list)
    # Create the keyword-extraction instance
    tr4w = TextRank4Keyword()
    # Analyze the text with a window size of 2, lowercasing English words
    tr4w.analyze(text=text, lower=True, window=2)
    with open(self.rule_reference_filename, "a") as f:
        # Write the top keywords, skipping stop words and filtered words
        f.write(
            '###########################关 键 词##################################'
            + '\n')
        for item in tr4w.get_keywords(num=num_key_word, word_min_len=1):
            if item.word in self.stopwords or item.word in self.filter_dictionary:
                continue
            else:
                f.write(item.word + '\t' + str(item.weight) + '\n')
    with open(self.rule_reference_filename, "a") as f:
        # Write the top key phrases
        f.write(
            '##########################关 键 短 语##################################'
            + '\n')
        for phrase in tr4w.get_keyphrases(keywords_num=num_key_phrase,
                                          min_occur_num=2):
            f.write(phrase + '\n')
    # Create the sentence-extraction instance
    tr4s = TextRank4Sentence()
    # Lowercase English words, apply POS filtering, and drop stop words
    tr4s.analyze(text=text, lower=True, source='all_filters')
    with open(self.rule_reference_filename, "a") as f:
        # Write the top key sentences
        f.write(
            '###########################关 键 句##################################'
            + '\n')
        for item in tr4s.get_key_sentences(num=num_key_sentence):
            f.write(
                str(item.index) + str(item.weight) + str(item.sentence) + '\n')
        f.write('----------------' + '\n')

def textRank_ppt(url, num_abs):
    """
    :param url: article URL
    :param num_abs: number of PPT slides to generate
    :return:
    """
    title, texts = article_extract(url)
    tr4w = TextRank4Keyword()
    # In py2, `text` must be a utf-8 encoded str or a unicode object;
    # in py3, a utf-8 encoded bytes or a str object
    tr4w.analyze(text=texts, lower=True, window=2)
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=texts, lower=True, source='all_filters')
    print('关键词:')
    key_words = ""
    for item in tr4w.get_keywords(7, word_min_len=2):
        print(item.word, item.weight)
        key_words = key_words + item.word + "\n"
    # Generate the PPT
    prs = Presentation()
    slide1, body_shape1 = ppt1.add_slide(prs=prs, slide_title=title,
                                         style_number=0)
    slide2, body_shape2 = ppt1.add_slide(prs=prs, style_number=1,
                                         slide_title="关键词", content="")
    ppt1.add_paragraph(body_shape2, text=key_words, size=20)
    i = 0
    # Extract images and add them to the PPT
    extract_image.pic_extract(url)
    print("句子:")
    for item in tr4s.get_key_sentences(num=(num_abs - 2) * 2):
        if i % 2 == 0:
            slide3, body_shape3 = ppt1.add_slide(prs=prs, style_number=1,
                                                 slide_title="摘要", content="")
            try:
                ppt1.add_picture(slide2=slide3,
                                 pic_path="image1/image_" + str(i) + ".jpg")
            except:
                print("no picture")
        i += 1
        # print(len(item.sentence), item.index)
        ppt1.add_paragraph(body_shape3, text=item.sentence, size=20)
    prs.save('test.pptx')
    print("ppt 已生成")

def extractKeywordFromUser(sentence, targetNum):
    textrank = TextRank4Keyword()
    s = sentence
    textrank.analyze(text=s, lower=True, window=2)
    keywords = [
        item.word for item in textrank.get_keywords(targetNum, word_min_len=1)
    ]
    candidatewords = []
    candidatescores = []
    if len(keywords) < targetNum:
        # Pad the keyword list with synonyms of the extracted keywords,
        # ranked by similarity (a score of exactly 1 marks the word itself)
        for keyword in keywords:
            wordlist, scorelist = synonyms.nearby(keyword)
            candidatewords.extend(wordlist)
            candidatescores.extend(scorelist)
        sortedIndex = np.argsort([-i for i in candidatescores])
        sortedIndex = [
            sortedIndex[i] for i in range(len(sortedIndex))
            if not candidatescores[sortedIndex[i]] == 1
        ]
        for i in range(targetNum - len(keywords)):
            keywords.append(candidatewords[sortedIndex[i]])
    return keywords

def main(self, content, label, window=2, k=20):
    """
    Extract keywords per category using TextRank.
    @ param `content` list of texts; each text should be a segmented
        (space-separated) string.
    @ param `label` list of strings/integers indicating the category.
    @ param `window` co-occurrence window size.
    @ param `k` top k keywords will be returned, default 20.
    """
    keywords = {}
    cate_content = {}
    # Concatenate all texts of the same category into one document
    for c, l in zip(content, label):
        if l in cate_content:
            cate_content[l] += " " + c.replace(" ", "")
        else:
            cate_content[l] = c.replace(" ", "")
    for item in cate_content.items():
        tr4w = TextRank4Keyword()
        tr4w.analyze(text=item[1], lower=True, window=window)
        kws = tr4w.get_keywords(k, word_min_len=1)
        keywords[item[0]] = [x.word for x in kws]
    return keywords

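# A sketch of calling the per-category extractor above; `content` holds
# pre-segmented (space-separated) texts and `label` the matching categories.
# `extractor` stands in for whatever instance owns main(); both sample lists
# are made up.
def demo_category_keywords(extractor):
    content = ['机器 学习 需要 大量 训练 数据', '足球 比赛 双方 进球 精彩']
    label = ['tech', 'sports']
    print(extractor.main(content, label, window=2, k=3))
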
def finalGet(content, number=5):
    text = content
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=text, lower=True, window=2)
    # Collect up to five key phrases/words so a user can identify the Note
    # in a list by its keywords
    wordList = []
    for phrase in tr4w.get_keyphrases(keywords_num=20, min_occur_num=2):
        wordList.append(phrase)
    if len(wordList) < number:
        for item in tr4w.get_keywords(number - len(wordList), word_min_len=2):
            wordList.append(item.word)
    print(wordList)
    # Build a combined summary
    summary = []
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    for item in tr4s.get_key_sentences(num=3):
        summary.append(item.sentence)
    print(summary)
    sumText = '。'.join(summary)
    print(sumText)
    return (wordList, sumText)