Esempio n. 1
0
    def get_keywords(self, text, keywords_num=None):
        """Rank the words of ``text`` and return the highest-scoring ones.

        Args:
            text: Full document content. It is stored unmodified on the
                instance before any processing takes place.
            keywords_num: Number of keywords to return. When ``None``, one
                third of the ``words_number`` value reported by
                ``build_worddict`` (truncated to an int) is used.

        Returns:
            A list of ``(word, score)`` tuples, highest score first.
        """
        # Keep the untouched input around before normalising it.
        self.__text = text

        normalized = util.as_text(text)  # deals with encoding issues
        # ``normalized`` is the whole document content, all in one string.
        raw_sentences = util.cut_sentences(normalized)

        # ``sentences`` records the original sentences of the segmented
        # text; ``wordlist_sents`` is what the keywords are extracted from.
        sentences, wordlist_sents = util.psegcut_filter_words(
            raw_sentences, self.__stop_words, self.__use_stopword)
        self.__sentences = sentences

        word_index, index_word, words_number = self.build_worddict(
            wordlist_sents)
        graph = self.build_word_grah(
            wordlist_sents, words_number, word_index, window=self.__window)
        scores = util.weight_map_rank(
            graph, max_iter=self.__max_iter, tol=self.__tol)

        if keywords_num is None:
            keywords_num = int(words_number / 3)

        # Pair every score with its position so the word can be looked up
        # again after ranking; each ranked item is (score, word position).
        ranked = nlargest(keywords_num, zip(scores, count()))
        return [(index_word[position], score) for score, position in ranked]
Esempio n. 2
0
 def summarize(self,text,n):
     """Return up to ``n`` of the highest-scoring sentences of ``text``.

     Line breaks are stripped from ``text``, it is split into sentences,
     a sentence graph is built from the filtered word lists and scored
     with ``util.weight_map_rank``; the best-scoring sentences are
     returned.

     Args:
         text: Document content to summarise.
         n: Maximum number of sentences to return.

     Returns:
         A list of original sentences, highest score first. It holds
         fewer than ``n`` items when fewer sentences were scored.
     """
     # NOTE(review): the newline stripping runs before util.as_text; if
     # callers may pass bytes this str-based replace would fail on
     # Python 3 -- confirm the expected input type.
     text = text.replace('\n', '')
     text = text.replace('\r', '')
     text = util.as_text(text)  # deals with encoding issues
     tokens = util.cut_sentences(text)
     # ``sentences`` records the original sentences of the text; ``sents``
     # is used for all the computations.
     sentences, sents = util.cut_filter_words(tokens, self.__stop_words, self.__use_stopword)
     if self.__use_w2v:
         sents = self.filter_dictword(sents)
     graph = self.create_graph_sentence(sents, self.__use_w2v)
     scores = util.weight_map_rank(graph, self.__max_iter, self.__tol)
     # Pair each score with its position so the sentence can be recovered.
     sent_selected = nlargest(n, zip(scores, count()))
     # Iterate over what nlargest actually returned: it yields fewer than
     # ``n`` items when there are fewer scores, and indexing range(n) into
     # it used to raise IndexError in that case.
     return [sentences[index] for _, index in sent_selected]
    def summarize(self,text,n):
        """Return up to ``n`` of the highest-scoring words of ``text``.

        Despite its name, this method ranks words rather than sentences:
        it builds a word graph over the filtered word lists, scores it
        with ``util.weight_map_rank`` and maps the best positions back
        through ``index_word``.

        Args:
            text: Document content to process.
            n: Maximum number of entries to return.

        Returns:
            A list of ``index_word`` entries, highest score first. It
            holds fewer than ``n`` items when fewer words were scored.
        """
        # NOTE(review): the newline stripping runs before util.as_text; if
        # callers may pass bytes this str-based replace would fail on
        # Python 3 -- confirm the expected input type.
        text = text.replace('\n', '')
        text = text.replace('\r', '')
        text = util.as_text(text)  # deals with encoding issues
        tokens = util.cut_sentences(text)
        # Only the filtered word lists are needed here; the original
        # sentences returned alongside them are not used.
        _, sents = util.psegcut_filter_words(tokens, self.__stop_words, self.__use_stopword)

        word_index, index_word, words_number = self.build_worddict(sents)
        graph = self.build_word_grah(sents, words_number, word_index, window=self.__window)
        scores = util.weight_map_rank(graph, max_iter=self.__max_iter, tol=self.__tol)
        # Pair each score with its position so the word can be recovered.
        sent_selected = nlargest(n, zip(scores, count()))
        # Iterate over what nlargest actually returned: it yields fewer
        # than ``n`` items when there are fewer scores, and indexing
        # range(n) into it used to raise IndexError in that case.
        return [index_word[index] for _, index in sent_selected]