def get_keywords(self, text, keywords_num=None):
    self.__text = text  # keep the original raw text
    text = util.as_text(text)  # normalize the encoding
    sentences = util.cut_sentences(text)  # split the full text (one long string) into sentences
    # `sentences` keeps the original sentences of the segmented article;
    # `wordlist_sents` holds the filtered word lists used for keyword extraction
    sentences, wordlist_sents = util.psegcut_filter_words(
        sentences, self.__stop_words, self.__use_stopword)
    self.__sentences = sentences
    word_index, index_word, words_number = self.build_worddict(wordlist_sents)
    graph = self.build_word_grah(wordlist_sents, words_number, word_index,
                                 window=self.__window)
    scores = util.weight_map_rank(graph, max_iter=self.__max_iter, tol=self.__tol)
    if keywords_num is None:  # default to roughly the top third of the vocabulary
        keywords_num = int(words_number / 3)
    keywords_selected = nlargest(keywords_num, zip(scores, count()))
    # each item is a (score, word_index) pair; map indices back to words
    return [(index_word[item[1]], item[0]) for item in keywords_selected]
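
# A minimal, self-contained sketch of the top-k selection idiom used above:
# zip(scores, count()) pairs each score with its position, and heapq.nlargest
# keeps the n highest-scoring pairs. The values below are invented purely for
# illustration; they are not taken from the model.
def _demo_topk_selection():
    from heapq import nlargest
    from itertools import count
    scores = [0.12, 0.45, 0.08, 0.31]  # one rank score per word index
    index_word = {0: 'alpha', 1: 'beta', 2: 'gamma', 3: 'delta'}
    selected = nlargest(2, zip(scores, count()))  # -> [(0.45, 1), (0.31, 3)]
    # map indices back to words, as get_keywords does
    return [(index_word[i], s) for s, i in selected]  # [('beta', 0.45), ('delta', 0.31)]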
def summarize(self, text, n):
    text = text.replace('\n', '')
    text = text.replace('\r', '')
    text = util.as_text(text)  # normalize the encoding
    tokens = util.cut_sentences(text)
    # `sentences` keeps the article's original sentences;
    # `sents` holds the filtered word lists used for the computations below
    sentences, sents = util.cut_filter_words(tokens, self.__stop_words,
                                             self.__use_stopword)
    if self.__use_w2v:
        sents = self.filter_dictword(sents)
    graph = self.create_graph_sentence(sents, self.__use_w2v)
    scores = util.weight_map_rank(graph, self.__max_iter, self.__tol)
    sent_selected = nlargest(n, zip(scores, count()))
    # collect the selected sentences' indices in the original article
    sent_index = [sent_selected[i][1] for i in range(n)]
    return [sentences[i] for i in sent_index]
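
# util.weight_map_rank is defined elsewhere in this repo. As a hedged sketch
# of what a weighted-graph PageRank like it typically computes (an assumption
# for illustration, not this repo's exact implementation), here is a plain
# power iteration over a weighted adjacency matrix:
def _demo_weighted_pagerank(graph, d=0.85, max_iter=100, tol=1e-6):
    import numpy as np
    g = np.asarray(graph, dtype=float)
    n = g.shape[0]
    col_sums = g.sum(axis=0)
    col_sums[col_sums == 0] = 1.0  # avoid division by zero for isolated nodes
    m = g / col_sums               # column-normalize the edge weights
    scores = np.full(n, 1.0 / n)   # uniform initial rank
    for _ in range(max_iter):
        new = (1 - d) / n + d * m.dot(scores)
        if np.abs(new - scores).sum() < tol:  # stop once the ranks converge
            break
        scores = new
    return scores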
def summarize(self, text, n):
    text = text.replace('\n', '')
    text = text.replace('\r', '')
    text = util.as_text(text)  # normalize the encoding
    tokens = util.cut_sentences(text)
    # `sentences` keeps the article's original sentences;
    # `sents` holds the filtered word lists used for the computations below
    sentences, sents = util.psegcut_filter_words(tokens, self.__stop_words,
                                                 self.__use_stopword)
    word_index, index_word, words_number = self.build_worddict(sents)
    graph = self.build_word_grah(sents, words_number, word_index,
                                 window=self.__window)
    scores = util.weight_map_rank(graph, max_iter=self.__max_iter, tol=self.__tol)
    sent_selected = nlargest(n, zip(scores, count()))
    # collect the selected indices; despite the variable names, the graph
    # nodes here are words, so this returns the top-n keywords
    sent_index = [sent_selected[i][1] for i in range(n)]
    return [index_word[i] for i in sent_index]
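
# build_worddict and build_word_grah are defined elsewhere in the class. As a
# hedged illustration of the standard window-based co-occurrence construction
# they likely perform (an assumption, not this repo's exact code), a minimal
# version:
def _demo_cooccurrence_graph(wordlist_sents, window=2):
    # Index each distinct word, then add an undirected edge weight for every
    # pair of words appearing within `window` positions in the same sentence.
    words = sorted({w for sent in wordlist_sents for w in sent})
    word_index = {w: i for i, w in enumerate(words)}
    n = len(words)
    graph = [[0.0] * n for _ in range(n)]
    for sent in wordlist_sents:
        for i, w in enumerate(sent):
            for j in range(i + 1, min(i + window, len(sent))):
                a, b = word_index[w], word_index[sent[j]]
                graph[a][b] += 1.0
                graph[b][a] += 1.0
    return graph, word_index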