def summarize(self, text, num=8, alpha=0.6):
    """
    :param text: str
    :param num: int
    :param alpha: float
    :return: list
    """
    # split the text into sentences
    if type(text) == str:
        self.sentences = cut_sentence(text)
    elif type(text) == list:
        self.sentences = text
    else:
        raise RuntimeError("text type must be list or str")
    # word segmentation
    sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence))
                      if word.strip()] for sentence in self.sentences]
    # remove stop words
    self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
    self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
    # # number of words in each sentence
    # sen_word_len = [len(sc)+1 for sc in sentences_cut]
    # tf-idf of each sentence
    sen_tfidf = tfidf_fit(self.sentences_cut)
    # pairwise sentence-similarity matrix,
    # e.g. SimMatrix[1, 3] is the similarity between sentence 2 and sentence 4
    SimMatrix = (sen_tfidf * sen_tfidf.T).A
    # number of sentences in the input text
    len_sen = len(self.sentences)
    # sentence indices
    sen_idx = [i for i in range(len_sen)]
    summary_set = []
    mmr = {}
    for i in range(len_sen):
        if self.sentences[i] not in summary_set:
            sen_idx_pop = copy.deepcopy(sen_idx)
            sen_idx_pop.pop(i)
            # similarity of sentence i to every other sentence
            sim_i_j = [SimMatrix[i, j] for j in sen_idx_pop]
            # dividing by sen_word_len[i] would make the score inaccurate
            score_tfidf = sen_tfidf[i].toarray()[0].sum()
            mmr[self.sentences[i]] = alpha * score_tfidf - (1 - alpha) * max(sim_i_j)
            summary_set.append(self.sentences[i])
    score_sen = [(rc[1], rc[0]) for rc in sorted(mmr.items(), key=lambda d: d[1], reverse=True)]
    if len(mmr) > num:
        score_sen = score_sen[0:num]
    return score_sen
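
# Illustration only (not part of the module): a minimal standalone sketch of the MMR scoring
# used above, with scikit-learn's TfidfVectorizer standing in for the tfidf_fit helper; the
# function name and the sample sentences are assumptions for demonstration, not the module's API.
def _mmr_scores_sketch(sentences_cut, alpha=0.6):
    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf = TfidfVectorizer().fit_transform(sentences_cut)   # one row per (pre-segmented) sentence
    sim = (tfidf * tfidf.T).toarray()                        # pairwise similarity matrix
    scores = {}
    for i in range(len(sentences_cut)):
        sim_others = [sim[i, j] for j in range(len(sentences_cut)) if j != i]
        # alpha * relevance (sum of tf-idf weights) - (1 - alpha) * redundancy (max similarity)
        scores[i] = alpha * tfidf[i].toarray()[0].sum() - (1 - alpha) * max(sim_others)
    return sorted(scores.items(), key=lambda d: d[1], reverse=True)
# Example: _mmr_scores_sketch(["机器 学习 很 有趣", "深度 学习 是 机器 学习 的 分支", "今天 天气 不错"])
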
def summarize(self, text, num=8, topic_min=6, judge_topic=None):
    """
    LDA
    :param text: str
    :param num: int
    :param topic_min: int
    :param judge_topic: boolean
    :return: list
    """
    # split the text into sentences
    if type(text) == str:
        self.sentences = cut_sentence(text)
    elif type(text) == list:
        self.sentences = text
    else:
        raise RuntimeError("text type must be list or str")
    len_sentences_cut = len(self.sentences)
    # word segmentation
    sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence))
                      if word.strip()] for sentence in self.sentences]
    # remove stop words
    self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
    self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
    # # term frequency of each sentence
    # vector_c = CountVectorizer(ngram_range=(1, 2), stop_words=self.stop_words)
    # tf_ngram = vector_c.fit_transform(self.sentences_cut)
    # tf-idf of each sentence
    tf_ngram = tfidf_fit(self.sentences_cut)
    # number of topics, heuristic: at most topic_min, and at most half the number of sentences
    topic_num = min(topic_min, int(len(sentences_cut) / 2))
    lda = LatentDirichletAllocation(n_components=topic_num, max_iter=32,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=2019)
    res_lda_u = lda.fit_transform(tf_ngram.T)
    res_lda_v = lda.components_

    if judge_topic:
        ### scheme one: take the top-k sentences of the single strongest topic
        ##################################################################################
        topic_t_score = np.sum(res_lda_v, axis=-1)
        # sort the scores of each column (one sentence over topic_num topics); index 0 is the largest
        res_nmf_h_soft = res_lda_v.argsort(axis=0)[-topic_num:][::-1]
        # count, per topic, the number of sentences for which it is the strongest
        exist = (res_nmf_h_soft <= 0) * 1.0
        factor = np.ones(res_nmf_h_soft.shape[1])
        topic_t_count = np.dot(exist, factor)
        # normalize
        topic_t_count /= np.sum(topic_t_count, axis=-1)
        topic_t_score /= np.sum(topic_t_score, axis=-1)
        # pick the topic with the largest combined share of sentence count and total score
        topic_t_tc = topic_t_count + topic_t_score
        topic_t_tc_argmax = np.argmax(topic_t_tc)
        # final sentence scores come from that strongest topic
        res_nmf_h_soft_argmax = res_lda_v[topic_t_tc_argmax].tolist()
        res_combine = {}
        for l in range(len_sentences_cut):
            res_combine[self.sentences[l]] = res_nmf_h_soft_argmax[l]
        score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
        #####################################################################################
    else:
        ### scheme two: score each sentence by its largest topic probability, regardless of topic
        res_combine = {}
        for i in range(len_sentences_cut):
            res_row_i = res_lda_v[:, i]
            res_row_i_argmax = np.argmax(res_row_i)
            res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax]
        score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
    num_min = min(num, int(len_sentences_cut * 0.6))
    return score_sen[0:num_min]
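
# Illustration only (not part of the module): a sketch of the judge_topic=None branch above,
# scoring each sentence by its largest topic weight in lda.components_; the vectorizer, topic
# count and sample sentences are assumptions, not the module's own pipeline.
def _lda_topic_scores_sketch(sentences_cut, n_topics=2):
    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation
    tf = CountVectorizer().fit_transform(sentences_cut)      # (n_sentences, n_terms)
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=16, random_state=2019)
    lda.fit(tf.T)                                            # transposed, so components_ is (n_topics, n_sentences)
    scores = {s: float(np.max(lda.components_[:, i])) for i, s in enumerate(sentences_cut)}
    return sorted(scores.items(), key=lambda d: d[1], reverse=True)
# Example: _lda_topic_scores_sketch(["机器 学习 很 有趣", "深度 学习 是 机器 学习 的 分支",
#                                    "今天 天气 不错", "明天 天气 可能 下雨"])
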
def summarize(self, text, num=320, topic_min=5, judge_topic='all'):
    """
    :param text: str
    :param num: int
    :param topic_min: int
    :param judge_topic: str
    :return: list
    """
    # split the text into sentences
    if type(text) == str:
        self.sentences = cut_sentence(text)
    elif type(text) == list:
        self.sentences = text
    else:
        raise RuntimeError("text type must be list or str")
    len_sentences_cut = len(self.sentences)
    # word segmentation
    sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence))
                      if word.strip()] for sentence in self.sentences]
    # remove stop words
    self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
    self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
    # tf-idf of each sentence
    sen_tfidf = tfidf_fit(self.sentences_cut)
    # number of topics, heuristic: at most topic_min, and at most half the number of sentences
    topic_num = min(topic_min, int(len(sentences_cut) / 2))
    svd_tfidf = TruncatedSVD(n_components=topic_num, n_iter=32)
    res_svd_u = svd_tfidf.fit_transform(sen_tfidf.T)
    res_svd_v = svd_tfidf.components_

    if judge_topic:
        ### scheme one: take the top-k sentences of the single strongest topic
        ##################################################################################
        topic_t_score = np.sum(res_svd_v, axis=-1)
        # sort the scores of each column (one sentence over topic_num topics); index 0 is the largest
        res_nmf_h_soft = res_svd_v.argsort(axis=0)[-topic_num:][::-1]
        # count, per topic, the number of sentences for which it is the strongest
        exist = (res_nmf_h_soft <= 0) * 1.0
        factor = np.ones(res_nmf_h_soft.shape[1])
        topic_t_count = np.dot(exist, factor)
        # normalize
        topic_t_count /= np.sum(topic_t_count, axis=-1)
        topic_t_score /= np.sum(topic_t_score, axis=-1)
        # pick the topic with the largest combined share of sentence count and total score
        topic_t_tc = topic_t_count + topic_t_score
        topic_t_tc_argmax = np.argmax(topic_t_tc)
        # final sentence scores come from that strongest topic
        res_nmf_h_soft_argmax = res_svd_v[topic_t_tc_argmax].tolist()
        res_combine = {}
        for l in range(len_sentences_cut):
            res_combine[self.sentences[l]] = res_nmf_h_soft_argmax[l]
        score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
        #####################################################################################
    else:
        ### scheme two: score each sentence by its largest topic weight, regardless of topic
        res_combine = {}
        for i in range(len_sentences_cut):
            res_row_i = res_svd_v[:, i]
            res_row_i_argmax = np.argmax(res_row_i)
            res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax]
        score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
    num_min = min(num, int(len_sentences_cut * 0.6))
    return score_sen[0:num_min]
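
# Illustration only (not part of the module): a condensed sketch of the judge_topic branch
# (scheme one) above, picking a single dominant topic from svd.components_ and ranking the
# sentences by their weight in that topic; the vectorizer and sample data are assumptions.
def _lsi_dominant_topic_sketch(sentences_cut, n_topics=2, top_k=2):
    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD
    tfidf = TfidfVectorizer().fit_transform(sentences_cut)
    svd = TruncatedSVD(n_components=n_topics, n_iter=16, random_state=2019)
    svd.fit(tfidf.T)                                   # components_ has shape (n_topics, n_sentences)
    v = svd.components_
    dominant = int(np.argmax(np.sum(v, axis=-1)))      # topic with the largest total score
    ranking = sorted(zip(v[dominant].tolist(), sentences_cut), reverse=True)
    return ranking[:top_k]                             # top_k sentences of the dominant topic
# Example: _lsi_dominant_topic_sketch(["机器 学习 很 有趣", "深度 学习 是 机器 学习 的 分支",
#                                      "今天 天气 不错", "明天 天气 可能 下雨"])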