def count_word(self, text, use_type="text"):
    """Count n-gram frequencies of a sentence/paragraph/document.

    :param text: str, raw text or a file path, e.g. "大漠帝国。" or "/home/data/doc.txt"
    :param use_type: str, "text" for raw input or "file" for a utf-8 txt file
    :return: None; results are stored on self.words_count (Counter of
             n-gram -> freq) and self.total_words (int, sum of all counts)
    :raises RuntimeError: when the path does not exist or use_type is invalid
    """
    self.words_count = Counter()
    if use_type == "text":  # plain-text input
        # split into sentences on Chinese/English commas, periods, exclamations, ...
        sentences = cut_sentence(use_type=self.algorithm, text=text)
        for sentence in sentences:
            # all n-grams (up to self.len_max) of one sentence
            n_grams = get_ngrams(use_type=self.algorithm, len_max=self.len_max, text=sentence)
            self.words_count.update(n_grams)
    elif use_type == "file":  # file input
        if not os.path.exists(text):
            raise RuntimeError("path of text must exist!")
        # "with" guarantees the handle is closed even if parsing raises mid-file
        with open(text, "r", encoding="utf-8") as fr8:
            for line in fr8:
                if line.strip():
                    sentences = cut_sentence(use_type=self.algorithm, text=line)
                    for sentence in sentences:
                        n_grams = get_ngrams(use_type=self.algorithm, len_max=self.len_max, text=sentence)
                        self.words_count.update(n_grams)
    else:
        raise RuntimeError("use_type must be 'text' or 'file'")
    self.total_words = sum(self.words_count.values())
def summarize(self, text, type_l='mix', num=320):
    """Positional (lead) summarization: score sentences by location.

    :param text: str or list, a document or pre-split sentences
    :param type_l: str, 'begin' (head), 'end' (tail) or 'mix' (head + tail + head slice)
    :param num: int, upper bound on returned sentences
    :return: list, (score, sentence) pairs sorted by score descending
    """
    # normalize the input to a list of sentences
    if type(text) == str:
        sentences = cut_sentence(text)
    elif type(text) == list:
        sentences = text
    else:
        raise RuntimeError("text type must be list or str")
    # never return more sentences than exist
    num_min = min(num, len(sentences))
    # pick candidates by position
    if type_l == 'begin':
        candidates = sentences[0:num]
    elif type_l == 'end':
        candidates = sentences[-num:]
    else:
        candidates = [sentences[0]] + [sentences[-1]] + sentences[1:num - 1]
    # position-decayed scores; the final candidate gets a fixed near-top score
    last_idx = len(candidates) - 1
    scored = {}
    for idx, sent in enumerate(candidates):
        if idx == last_idx:
            scored[sent] = (num - 0.75) / (num + 1)
        else:
            scored[sent] = (num - idx - 0.5) / (num + 1)
    ranked = sorted(scored.items(), key=lambda kv: kv[1], reverse=True)
    return [(weight, sent) for sent, weight in ranked][0:num_min]
def summarize(self, text, num=320):
    """Corpus/pagerank summarization: rank sentences by corpus importance.

    :param text: str or list, a document or pre-split sentences
    :param num: int, upper bound on returned sentences
    :return: list, (score, sentence) pairs sorted by score descending
    """
    # normalize the input to a list of sentences
    if type(text) == str:
        sentences = cut_sentence(text)
    elif type(text) == list:
        sentences = text
    else:
        raise RuntimeError("text type must be list or str")
    # sentence string >>> indexed corpus entry
    corpus = _build_corpus(sentences)
    # pagerank-style importance, keyed by the (hashable) corpus entry
    most_important_docs = summarize_corpus(corpus)
    sentences_score = {sentences[pos]: most_important_docs[tuple(doc)]
                       for pos, doc in enumerate(corpus)}
    # keep at most 60% of the sentences (and never more than num)
    num_min = min(num, int(len(sentences) * 0.6))
    ranked = sorted(sentences_score.items(), key=lambda kv: kv[1], reverse=True)
    return [(weight, sent) for sent, weight in ranked][0:num_min]
def summarize(self, text, num=320, type_sim="cosine", type_encode="avg", config=None):
    """Extractive summarization: textrank over word2vec sentence similarity.

    :param text: str or list, doc. like "大漠帝国是历史上存在的国家吗?你知不知道?嗯。"
    :param num: int, max number of sentences returned, like 6
    :param type_sim: str, type of similarity. like "total", "cosine"
    :param type_encode: str, sentence encoding type. like "avg"
    :param config: dict or None, pagerank kwargs; defaults to
                   {"alpha": 0.86, "max_iter": 100} (None avoids a shared
                   mutable default argument)
    :return: list, (score, sentence) pairs. like
             [(0.069, 'PageRank The PageRank Citation Ranking'), ...]
    """
    if config is None:
        config = {"alpha": 0.86, "max_iter": 100}
    # split into sentences
    if type(text) == str:
        self.sentences = cut_sentence(text)
    elif type(text) == list:
        self.sentences = text
    else:
        raise RuntimeError("text type must be list or str")
    len_sen = len(self.sentences)
    # graph vertices: one index per sentence (in order)
    sent2idx = {}
    idx2sent = {}
    for sent_idx, sent in enumerate(self.sentences):
        sent2idx[sent] = sent_idx
        idx2sent[sent_idx] = sent
    graph_sents = np.zeros((len_sen, len_sen))
    # graph edges: pairwise sentence similarity. The matrix is symmetric, so
    # each unordered pair is computed once (assumes similarity(a, b) ==
    # similarity(b, a), which holds for cosine; the original wrote both cells
    # symmetrically anyway).
    for i in range(len_sen):
        for j in range(i, len_sen):
            score_w2v_cosine = self.similarity(self.sentences[i], self.sentences[j],
                                               type_sim=type_sim,
                                               type_encode=type_encode)
            graph_sents[i][j] = score_w2v_cosine
            graph_sents[j][i] = score_w2v_cosine
    # similarity matrix -> networkx graph
    w2v_cosine_sim = nx.from_numpy_matrix(graph_sents)
    # pagerank over the similarity graph
    sens_scores = nx.pagerank(w2v_cosine_sim, **config)
    # rank by score, keep topk to stay in bounds
    sen_rank = sorted(sens_scores.items(), key=lambda x: x[1], reverse=True)
    topk = min(len(sen_rank), num)
    return [(sr[1], self.sentences[sr[0]]) for sr in sen_rank][0:topk]
def summarize(self, text, num=320, title=None):
    """Feature-based summarization: per-sentence blend of word (sbs/dbs),
    length, position and optional title scores.

    :param text: str or list, document or pre-split sentences
    :param num: int, max sentences returned
    :param title: str or None, optional title used as an extra ranking signal
    :return: list, (score, sentence) pairs sorted by score descending
    """
    # split into sentences
    if type(text) == str:
        self.sentences = cut_sentence(text)
    elif type(text) == list:
        self.sentences = text
    else:
        raise RuntimeError("text type must be list or str")
    self.title = title
    if self.title:
        self.title = macropodus_cut(title)  # tokenized title
    # tokenize each sentence (Chinese chars only), dropping whitespace tokens
    sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence))
                      if word.strip()] for sentence in self.sentences]
    # remove stop words
    self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc))
                          for sc in sentences_cut]
    # word-frequency statistics over the whole document
    self.words = []
    for sen in self.sentences_cut:
        self.words = self.words + sen
    self.word_count = dict(Counter(self.words))
    # word_count_rank = sorted(word_count.items(), key=lambda f:f[1], reverse=True)
    # self.word_freqs = [{'word':wcr[0], 'freq':wcr[1]} for wcr in word_count_rank]
    # frequency-based word score: word -> weighted relative frequency
    self.word_freqs = {}
    self.len_words = len(self.words)
    for k, v in self.word_count.items():
        self.word_freqs[k] = v * 0.5 / self.len_words
    # positional score per sentence
    scores_posi = self.score_position()
    res_rank = {}
    self.res_score = []
    for i in range(len(sentences_cut)):
        sen = self.sentences[i]          # the sentence itself
        sen_cut = self.sentences_cut[i]  # its stop-word-filtered tokens
        score_sbs = self.score_sbs(sen_cut)  # word score 1 (sbs)
        score_dbs = self.score_dbs(sen_cut)  # word score 2 (dbs)
        score_word = (score_sbs + score_dbs) * 10.0 / 2.0  # combined word score
        score_length = self.score_length(sen)  # sentence-length score
        score_posi = scores_posi[i]
        if self.title:
            # title available: blend in the title-similarity score
            score_title = self.score_title(sen_cut)
            score_total = (score_title * 0.5 + score_word * 2.0
                           + score_length * 0.5 + score_posi * 1.0) / 4.0
            # per-part score bookkeeping kept for inspection.
            # NOTE(review): a header row is appended on EVERY iteration,
            # interleaving headers with data rows — presumably intentional
            # for eyeballing; confirm before relying on self.res_score's shape.
            self.res_score.append(["score_total", "score_sbs", "score_dbs",
                                   "score_word", "score_length", "score_posi",
                                   "score_title", "sentences"])
            self.res_score.append([score_total, score_sbs, score_dbs,
                                   score_word, score_length, score_posi,
                                   score_title, self.sentences[i]])
        else:
            # no title: blend the remaining scores only
            score_total = (score_word * 2.0 + score_length * 0.5
                           + score_posi * 1.0) / 3.5
            self.res_score.append(["score_total", "score_sbs", "score_dbs",
                                   "score_word", "score_length", "score_posi",
                                   "sentences"])
            self.res_score.append([score_total, score_sbs, score_dbs,
                                   score_word, score_length, score_posi,
                                   self.sentences[i].strip()])
        res_rank[self.sentences[i].strip()] = score_total
    # cap at 60% — NOTE(review): based on distinct-WORD count, not sentence
    # count (sibling implementations use sentence count); confirm intent.
    num_min = min(num, int(len(self.word_count) * 0.6))
    score_sen = [(rc[1], rc[0])
                 for rc in sorted(res_rank.items(), key=lambda d: d[1],
                                  reverse=True)][0:num_min]
    return score_sen
def summarize(self, text, num=320):
    """Pick central sentences by word significance (document word frequency).

    :param text: str or list, document or pre-split sentences
    :param num: int, max sentences returned
    :return: list, (score, sentence) pairs; scores only encode extraction order
    """
    # split into sentences
    if type(text) == str:
        self.sentences = cut_sentence(text)
    elif type(text) == list:
        self.sentences = text
    else:
        raise RuntimeError("text type must be list or str")
    # tokenize (Chinese chars only), dropping whitespace tokens
    sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence))
                      if word.strip()] for sentence in self.sentences]
    # remove stop words
    self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc))
                          for sc in sentences_cut]
    # word-frequency statistics over the whole document
    self.words = []
    for sen in self.sentences_cut:
        self.words = self.words + sen
    self.word_count = dict(Counter(self.words))
    self.word_count_rank = sorted(self.word_count.items(), key=lambda f: f[1],
                                  reverse=True)
    # at most 60% of distinct words (and never more than num)
    num_min = min(num, int(len(self.word_count) * 0.6))
    # words ordered by frequency
    self.word_rank = [wcr[0] for wcr in self.word_count_rank][0:num_min]
    res_sentence = []
    seen = set()  # O(1) duplicate check instead of rescanning res_sentence
    # extract, in document order, the first not-yet-chosen sentence
    # containing each high-frequency word; stop once num_min is reached
    for word in self.word_rank:
        if len(res_sentence) >= num_min:
            break
        for sentence in self.sentences:
            if sentence not in seen and word in sentence:
                res_sentence.append(sentence)
                seen.add(sentence)
                break
    # pseudo-scores in (0, 1) that merely reflect extraction order
    len_sentence = len(self.sentences)
    res_sentence = [(1 - 1 / (len_sentence + len_sentence / (k + 1)), rs)
                    for k, rs in enumerate(res_sentence)]
    return res_sentence
def summarize(self, text, num=8, alpha=0.6):
    """MMR (maximal marginal relevance) summarization over tf-idf.

    :param text: str or list, document or pre-split sentences
    :param num: int, max sentences returned
    :param alpha: float, relevance/redundancy trade-off in [0, 1]
    :return: list, (score, sentence) pairs sorted by score descending
    """
    # split into sentences
    if type(text) == str:
        self.sentences = cut_sentence(text)
    elif type(text) == list:
        self.sentences = text
    else:
        raise RuntimeError("text type must be list or str")
    # tokenize (Chinese chars only), drop stop words, re-join for tf-idf
    sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence))
                      if word.strip()] for sentence in self.sentences]
    self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc))
                          for sc in sentences_cut]
    self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
    # # per-sentence token counts
    # sen_word_len = [len(sc)+1 for sc in sentences_cut]
    # tf-idf of each sentence
    sen_tfidf = tfidf_fit(self.sentences_cut)
    # dense pairwise similarity, e.g. SimMatrix[1, 3] is sim(sent 2, sent 4)
    SimMatrix = (sen_tfidf * sen_tfidf.T).A
    len_sen = len(self.sentences)
    sen_idx = list(range(len_sen))
    summary_set = set()  # membership-only bookkeeping; set gives O(1) lookup
    mmr = {}
    for i in range(len_sen):
        if self.sentences[i] not in summary_set:
            # indexes of all the *other* sentences; a shallow copy suffices
            # for a list of ints (deepcopy was unnecessary)
            sen_idx_pop = sen_idx[:i] + sen_idx[i + 1:]
            sim_i_j = [SimMatrix[i, j] for j in sen_idx_pop]
            # raw tf-idf mass of the sentence
            # (dividing by word count was judged inaccurate: / sen_word_len[i])
            score_tfidf = sen_tfidf[i].toarray()[0].sum()
            # guard the single-sentence document: max() on [] would raise
            max_sim = max(sim_i_j) if sim_i_j else 0.0
            mmr[self.sentences[i]] = alpha * score_tfidf - (1 - alpha) * max_sim
            summary_set.add(self.sentences[i])
    score_sen = [(rc[1], rc[0])
                 for rc in sorted(mmr.items(), key=lambda d: d[1], reverse=True)]
    if len(mmr) > num:
        score_sen = score_sen[0:num]
    return score_sen
def summarize(self, text, num=320):
    """TextRank summarization over a tf-idf sentence-similarity graph.

    :param text: str or list, a document or pre-split sentences
    :param num: int, upper bound on returned sentences
    :return: list, (score, sentence) pairs sorted by pagerank score descending
    """
    # normalize the input to a list of sentences
    if type(text) == str:
        sentences = cut_sentence(text)
    elif type(text) == list:
        sentences = text
    else:
        raise RuntimeError("text type must be list or str")
    # tf-idf representation of the sentences
    matrix = tdidf_sim(sentences)
    matrix_norm = TfidfTransformer().fit_transform(matrix)
    # sparse sentence-by-sentence similarity -> graph
    sim_graph = nx.from_scipy_sparse_matrix(matrix_norm * matrix_norm.T)
    # pagerank over the similarity graph
    scores = nx.pagerank(sim_graph)
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    # cap at the number of available sentences to stay in bounds
    topk = min(len(sentences), num)
    return [(weight, sentences[idx]) for idx, weight in ranked][:topk]
def summarize(self, text, num=8, topic_min=6, judge_topic=None):
    """LDA topic-model summarization.

    :param text: str or list, document or pre-split sentences
    :param num: int, max sentences returned
    :param topic_min: int, upper bound on the number of LDA topics
    :param judge_topic: truthy -> rank sentences inside the single dominant
                        topic; falsy -> rank each sentence by its best topic score
    :return: list, (score, sentence) pairs sorted by score descending
    """
    # split into sentences
    if type(text) == str:
        self.sentences = cut_sentence(text)
    elif type(text) == list:
        self.sentences = text
    else:
        raise RuntimeError("text type must be list or str")
    len_sentences_cut = len(self.sentences)
    # tokenize (Chinese chars only), dropping whitespace tokens
    sentences_cut = [[
        word for word in macropodus_cut(extract_chinese(sentence))
        if word.strip()
    ] for sentence in self.sentences]
    # drop stop words, then re-join as space-separated strings for tf-idf
    self.sentences_cut = [
        list(filter(lambda x: x not in self.stop_words, sc))
        for sc in sentences_cut
    ]
    self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
    # # term-frequency alternative
    # vector_c = CountVectorizer(ngram_range=(1, 2), stop_words=self.stop_words)
    # tf_ngram = vector_c.fit_transform(self.sentences_cut)
    # tf-idf of each sentence
    tf_ngram = tfidf_fit(self.sentences_cut)
    # heuristic topic count: at most topic_min, at most half the sentences
    topic_num = min(topic_min, int(len(sentences_cut) / 2))
    # FIX: "n_topics" was renamed "n_components" in scikit-learn 0.19 and
    # removed in 0.21; the old keyword raises TypeError on current sklearn.
    lda = LatentDirichletAllocation(n_components=topic_num, max_iter=32,
                                    learning_method='online',
                                    learning_offset=50., random_state=2019)
    res_lda_u = lda.fit_transform(tf_ngram.T)
    res_lda_v = lda.components_  # presumably (topic_num, n_sentences) — verify
    if judge_topic:
        ### scheme 1: take the k sentences of the single dominant topic
        ##################################################################################
        topic_t_score = np.sum(res_lda_v, axis=-1)
        # per column (a sentence's topic_num topic scores), sort; row 0 = max
        res_nmf_h_soft = res_lda_v.argsort(axis=0)[-topic_num:][::-1]
        # count, per topic, how many sentences have it as their maximum
        exist = (res_nmf_h_soft <= 0) * 1.0
        factor = np.ones(res_nmf_h_soft.shape[1])
        topic_t_count = np.dot(exist, factor)
        # normalize both signals
        topic_t_count /= np.sum(topic_t_count, axis=-1)
        topic_t_score /= np.sum(topic_t_score, axis=-1)
        # blend count share and score share; pick the strongest topic
        topic_t_tc = topic_t_count + topic_t_score
        topic_t_tc_argmax = np.argmax(topic_t_tc)
        # final sentence scores come from that dominant topic's row
        res_nmf_h_soft_argmax = res_lda_v[topic_t_tc_argmax].tolist()
        res_combine = {}
        for l in range(len_sentences_cut):
            res_combine[self.sentences[l]] = res_nmf_h_soft_argmax[l]
        score_sen = [(rc[1], rc[0]) for rc in sorted(
            res_combine.items(), key=lambda d: d[1], reverse=True)]
        #####################################################################################
    else:
        ### scheme 2: each sentence keeps its best topic probability, topics mixed
        res_combine = {}
        for i in range(len_sentences_cut):
            res_row_i = res_lda_v[:, i]
            res_row_i_argmax = np.argmax(res_row_i)
            res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax]
        score_sen = [(rc[1], rc[0]) for rc in sorted(
            res_combine.items(), key=lambda d: d[1], reverse=True)]
    # keep at most 60% of the sentences (and never more than num)
    num_min = min(num, int(len_sentences_cut * 0.6))
    return score_sen[0:num_min]
def keyword(self, text, num=6, score_min=0.025, win_size=3, type_sim="total",
            type_encode="avg", config={"alpha": 0.86, "max_iter": 100}):
    """Keyword extraction: textrank over word2vec word similarity.

    :param text: str, doc. like "大漠帝国是历史上存在的国家吗?你知不知道?嗯。"
    :param num: int, number of keywords, like 6
    :param score_min: float, minimum score for a word to be kept
    :param win_size: int, co-occurrence window size, like 2
    :param type_sim: str, similarity type, like "total", "cosine"
    :param type_encode: str, encoding type, like "avg"
    :param config: dict, pagerank config, like {"alpha": 0.86, "max_iter": 100}
    :return: list, (score, word) pairs, like [(0.020, '手机'), (0.016, '夏普')]
    """
    # split into sentences
    if type(text) == str:
        self.sentences = cut_sentence(text)
    elif type(text) == list:
        self.sentences = text
    else:
        raise RuntimeError("text type must be list or str")
    # tokenize each sentence with macropodus_cut
    self.macropodus_word = [
        macropodus_cut(sentence) for sentence in self.sentences
    ]
    # drop stop words
    self.sentences_word = [[
        w for w in mw if w not in self.stop_words.values()
    ] for mw in self.macropodus_word]
    # graph vertices: one index per distinct word, in first-seen order
    word2index = {}
    index2word = {}
    for sent_words in self.sentences_word:
        for token in sent_words:
            if token not in word2index:
                fresh = len(word2index)
                word2index[token] = fresh
                index2word[fresh] = token
    vocab_size = len(word2index)
    graph_words = np.zeros((vocab_size, vocab_size))
    # graph edges: similarity of word pairs co-occurring within the window
    for sent_words in self.sentences_word:
        for w_left, w_right in self.cut_window(sent_words, win_size=win_size):
            if w_left in word2index and w_right in word2index:
                i_left, i_right = word2index[w_left], word2index[w_right]
                sim = self.similarity(w_left, w_right,
                                      type_sim=type_sim,
                                      type_encode=type_encode)
                graph_words[i_left][i_right] = sim
                graph_words[i_right][i_left] = sim
    # similarity matrix -> networkx graph
    w2v_cosine_sim = nx.from_numpy_matrix(graph_words)
    # pagerank over the word graph
    sens_scores = nx.pagerank(w2v_cosine_sim, **config)
    ranked = sorted(sens_scores.items(), key=lambda kv: kv[1], reverse=True)
    # cap at num, stay in bounds
    topk = min(len(ranked), num)
    # keep multi-char words above the score threshold
    return [(weight, index2word[idx]) for idx, weight in ranked
            if len(index2word[idx]) > 1 and score_min <= weight][0:topk]
def summarize(self, text, num=320, title=None):
    """Rank sentences by blended n-gram, word-length, length, position,
    POS-tag and optional title features.

    :param text: str or list, document or pre-split sentences
    :param num: int, max sentences returned
    :param title: str or None, optional title used as an extra ranking signal
    :return: list, (score, sentence) pairs sorted by score descending
    """
    # split into sentences
    if type(text) == str:
        self.sentences = cut_sentence(text)
    elif type(text) == list:
        self.sentences = text
    else:
        raise RuntimeError("text type must be list or str")
    self.title = title
    if self.title:
        self.title = macropodus_cut(title)  # tokenized title
    # tokenize with POS tagging
    self.sentences_tag_cut = [jieba_tag_cut(extract_chinese(sentence))
                              for sentence in self.sentences]
    # tokens only, without the POS tags
    sentences_cut = [[jc for jc in jtc.keys()]
                     for jtc in self.sentences_tag_cut]
    # remove stop words
    self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc))
                          for sc in sentences_cut]
    # word-frequency statistics over the whole document
    self.words = []
    for sen in self.sentences_cut:
        self.words = self.words + sen
    self.word_count = dict(Counter(self.words))
    # frequency-based word score: word -> weighted relative frequency
    self.word_freqs = {}
    self.len_words = len(self.words)
    for k, v in self.word_count.items():
        self.word_freqs[k] = v * 0.5 / self.len_words
    # document-level uni/bi/tri-gram features
    [gram_uni, gram_bi, gram_tri] = get_ngrams("".join(self.sentences),
                                               ns=[1, 2, 3])
    ngrams = gram_uni + gram_bi + gram_tri
    self.ngrams_count = dict(Counter(ngrams))
    # positional score per sentence
    scores_posi = self.score_position()
    # length score per sentence
    scores_length = self.score_length()
    # POS-tag score per sentence (nouns 1.2, pronouns 0.8, verbs 1.0)
    scores_tag = self.score_tag()
    res_rank = {}
    self.res_score = []
    for i in range(len(sentences_cut)):
        sen_cut = self.sentences_cut[i]  # stop-word-filtered tokens
        # n-gram score: average document frequency of this sentence's n-grams
        [gram_uni_, gram_bi_, gram_tri_] = get_ngrams(self.sentences[i],
                                                      ns=[1, 2, 3])  # gram_uni_bi_tri(self.sentences[i])
        n_gram_s = gram_uni_ + gram_bi_ + gram_tri_
        score_ngram = sum([self.ngrams_count[ngs] if ngs in self.ngrams_count
                           else 0 for ngs in n_gram_s]) / (len(n_gram_s) + 1)
        # average token length within the sentence
        score_word_length_avg = sum([len(sc) for sc in sen_cut]) / (len(sen_cut) + 1)
        score_posi = scores_posi[i]
        score_length = scores_length[i]
        score_tag = scores_tag[i]
        if self.title:
            # title available: blend in the title-similarity score
            score_title = self.score_title(sen_cut)
            score_total = (score_title * 0.5 + score_ngram * 2.0
                           + score_word_length_avg * 0.5 + score_length * 0.5
                           + score_posi * 1.0 + score_tag * 0.6) / 6.0
            # per-part score bookkeeping kept for inspection.
            # NOTE(review): a header row is appended on EVERY iteration,
            # interleaving headers with data rows — confirm before relying
            # on self.res_score's shape.
            self.res_score.append(["score_title", "score_ngram",
                                   "score_word_length_avg", "score_length",
                                   "score_posi", "score_tag"])
            self.res_score.append([score_title, score_ngram,
                                   score_word_length_avg, score_length,
                                   score_posi, score_tag, self.sentences[i]])
        else:
            # no title: blend the remaining scores only
            score_total = (score_ngram * 2.0 + score_word_length_avg * 0.5
                           + score_length * 0.5 + score_posi * 1.0
                           + score_tag * 0.6) / 5.0
            self.res_score.append(["score_ngram", "score_word_length_avg",
                                   "score_length", "score_posi", "score_tag"])
            self.res_score.append([score_ngram, score_word_length_avg,
                                   score_length, score_posi, score_tag,
                                   self.sentences[i]])
        res_rank[self.sentences[i].strip()] = score_total
    # cap at 60% — NOTE(review): based on distinct-WORD count, not sentence
    # count; looks inconsistent with sibling implementations, confirm intent.
    num_min = min(num, int(len(self.word_count) * 0.6))
    res_rank_sort = sorted(res_rank.items(), key=lambda rr: rr[1], reverse=True)
    res_rank_sort_reverse = [(rrs[1], rrs[0]) for rrs in res_rank_sort][0:num_min]
    return res_rank_sort_reverse
def summarize(self, text, num=320, topic_min=5, judge_topic='all'):
    """LSI-style summarization: truncated SVD over sentence tf-idf.

    :param text: str or list, document or pre-split sentences
    :param num: int, max sentences returned
    :param topic_min: int, upper bound on the number of SVD components
    :param judge_topic: truthy (default 'all') -> rank sentences inside the
                        single dominant topic; falsy -> rank each sentence by
                        its best topic value
    :return: list, (score, sentence) pairs sorted by score descending
    """
    # split into sentences
    if type(text) == str:
        self.sentences = cut_sentence(text)
    elif type(text) == list:
        self.sentences = text
    else:
        raise RuntimeError("text type must be list or str")
    len_sentences_cut = len(self.sentences)
    # tokenize (Chinese chars only), dropping whitespace tokens
    sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence))
                      if word.strip()] for sentence in self.sentences]
    # drop stop words, then re-join as space-separated strings for tf-idf
    self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc))
                          for sc in sentences_cut]
    self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
    # tf-idf of each sentence
    sen_tfidf = tfidf_fit(self.sentences_cut)
    # heuristic topic count: at most topic_min, at most half the sentences
    topic_num = min(topic_min, int(len(sentences_cut) / 2))
    svd_tfidf = TruncatedSVD(n_components=topic_num, n_iter=32)
    res_svd_u = svd_tfidf.fit_transform(sen_tfidf.T)
    res_svd_v = svd_tfidf.components_  # presumably (topic_num, n_sentences) — verify
    if judge_topic:
        ### scheme 1: take the k sentences of the single dominant topic
        ##################################################################################
        topic_t_score = np.sum(res_svd_v, axis=-1)
        # per column (a sentence's topic_num topic values), sort; row 0 = max
        res_nmf_h_soft = res_svd_v.argsort(axis=0)[-topic_num:][::-1]
        # count, per topic, how many sentences have it as their maximum
        exist = (res_nmf_h_soft <= 0) * 1.0
        factor = np.ones(res_nmf_h_soft.shape[1])
        topic_t_count = np.dot(exist, factor)
        # normalize both signals
        topic_t_count /= np.sum(topic_t_count, axis=-1)
        topic_t_score /= np.sum(topic_t_score, axis=-1)
        # blend count share and score share; pick the strongest topic
        topic_t_tc = topic_t_count + topic_t_score
        topic_t_tc_argmax = np.argmax(topic_t_tc)
        # final sentence scores come from that dominant topic's row
        res_nmf_h_soft_argmax = res_svd_v[topic_t_tc_argmax].tolist()
        res_combine = {}
        for l in range(len_sentences_cut):
            res_combine[self.sentences[l]] = res_nmf_h_soft_argmax[l]
        score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(),
                     key=lambda d: d[1], reverse=True)]
        #####################################################################################
    else:
        ### scheme 2: each sentence keeps its best topic value, topics mixed
        res_combine = {}
        for i in range(len_sentences_cut):
            res_row_i = res_svd_v[:, i]
            res_row_i_argmax = np.argmax(res_row_i)
            res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax]
        score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(),
                     key=lambda d: d[1], reverse=True)]
    # keep at most 60% of the sentences (and never more than num)
    num_min = min(num, int(len_sentences_cut * 0.6))
    return score_sen[0:num_min]