def key_rank(text, topk=18):
    """Extract the top *topk* keywords from *text* using TextRank.

    Splits the text into sentences, tokenizes each one with Tokenize,
    runs KeywordTextRank over the token lists and returns the ranked
    keywords as a list.
    """
    sents = list(cut_sentence(text))
    docs = [list(Tokenize(sent)) for sent in sents]
    keyword_rank = textrank.KeywordTextRank(docs)
    keyword_rank.solve()
    # top_index() already yields the keywords; a pass-through
    # comprehension ([w for w in ...]) is redundant.
    return list(keyword_rank.top_index(topk))
def key_rank(text, topk=18):
    """Extract the top *topk* keywords from *text* using TextRank.

    Same pipeline as the Tokenize-based variant above, but tokenizes
    sentences with tokenize2.
    """
    sents = list(cut_sentence(text))
    docs = [list(tokenize2(sent)) for sent in sents]
    keyword_rank = textrank.KeywordTextRank(docs)
    keyword_rank.solve()
    # top_index() already yields the keywords; a pass-through
    # comprehension ([w for w in ...]) is redundant.
    return list(keyword_rank.top_index(topk))
def summarize(txt):
    """Return (keyword string, summary, class label) for *txt*.

    Tokenizes each sentence, feeds the flattened token stream to
    key_words for keyword/classification, and builds the summary with
    summarize4 over the per-sentence token lists.
    """
    sents = list(cppjiebapy.cut_sentence(txt))
    docs = [list(Tokenize(sent)) for sent in sents]
    # Flatten the per-sentence token lists in one pass instead of
    # repeated list concatenation (O(n^2) in the worst case).
    keys = list(itertools.chain.from_iterable(docs))
    (top_n_words, cls) = key_words(keys, N_2)
    top_n_sum = summarize4(sents, docs)
    return ", ".join(top_n_words[:18]), top_n_sum, cls
def sum_rank(text):
    """Return a 3-sentence summary of *text* (BM25 graph + TextRank).

    Sentences are rejoined in document order with the Chinese full
    stop; CR/LF characters are stripped from the result.
    """
    sentences = list(cut_sentence(text))
    tokenized = [list(tokenize2(s)) for s in sentences]
    ranker = textrank.TextRank(bm25_weights(tokenized))
    ranker.solve()
    chosen = [sentences[i] for i in sorted(ranker.top_index(3))]
    joined = u'。 '.join(chosen)
    return joined.replace('\r', '').replace('\n', '') + u'。'
def sum_rank(text):
    """Return a 3-sentence summary of *text* (BM25 graph + TextRank).

    Sentences are rejoined in document order with the Chinese full
    stop; CR/LF characters are stripped from the result.
    """
    # BUG FIX: the original called cut_sentence(obj.content), but `obj`
    # is not defined in this function -- the parameter is `text`.
    sents = list(cut_sentence(text))
    docs = [list(Tokenize(sent)) for sent in sents]
    sim_res = bm25_weights(docs)
    rank = TextRank(sim_res)
    rank.solve()
    top_n_summary = []
    for index in sorted(rank.top_index(3)):
        top_n_summary.append(sents[index])
    return u'。 '.join(top_n_summary).replace('\r', '').replace('\n', '') + u'。'
def summarize(txt):
    """Return (keyword string, summary, class label) for *txt*.

    Tokenizes each sentence, feeds the flattened token stream to
    key_words for keyword/classification, and builds the summary with
    summarize4 over the per-sentence token lists.
    """
    sents = list(cppjiebapy.cut_sentence(txt))
    docs = [list(Tokenize(sent)) for sent in sents]
    # Flatten the per-sentence token lists in one pass instead of
    # repeated list concatenation (O(n^2) in the worst case).
    keys = list(itertools.chain.from_iterable(docs))
    (top_n_words, cls) = key_words(keys, N_2)
    top_n_sum = summarize4(sents, docs)
    return ', '.join(top_n_words[:18]), top_n_sum, cls
def test_rank2(obj): sents = list(cut_sentence(obj.content)) docs = [list(Tokenize(sent)) for sent in sents] sim_res = bm25_weights(docs) rank = TextRank(sim_res) rank.solve() top_n_summary = [] for index in sorted(rank.top_index(3)): top_n_summary.append(sents[index]) print 'test_rank2', u'。 '.join(top_n_summary).replace('\r', '').replace( '\n', '') + u'。'
def test3(): obj = HtmlContent.objects.get(pk=34) sents = list(cut_sentence(obj.content)) docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents] bm25 = BM25(docs) l = len(sents) test_dense = np.zeros((l,l)) for i in xrange(l): scores = bm25.simall(docs[i]) test_dense[i] = scores print 'bm25=', test_dense
def test3(): obj = HtmlContent.objects.get(pk=34) sents = list(cut_sentence(obj.content)) docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents] bm25 = BM25(docs) l = len(sents) test_dense = np.zeros((l, l)) for i in xrange(l): scores = bm25.simall(docs[i]) test_dense[i] = scores print 'bm25=', test_dense
def text_rank4(obj): sents = list(cut_sentence(obj.content)) vect = TfidfVectorizer(min_df=1,tokenizer=Tokenize) tfidf = vect.fit_transform(sents) tfidf_graph = tfidf*tfidf.T #print 's ', tfidf_graph.A #print 'o ', np.dot(tfidf.A,tfidf.A.T) nx_graph = nx.from_scipy_sparse_matrix(tfidf_graph) scores = nx.pagerank(nx_graph) res = sorted(((scores[i],i) for i,s in enumerate(sents)), reverse=True) top_n_summary = [sents[i] for _,i in sorted(res[:3])] print 'text_rank4', u'。 '.join(top_n_summary).replace('\r','').replace('\n','')+u'。'
def test1(): obj = HtmlContent.objects.get(pk=46) key, sum, cls = summarize(obj.content) key2 = key_rank(obj.content) sents = list(cut_sentence(obj.content)) docs = [list(Tokenize(sent)) for sent in sents] sum2 = summarize4(sents) print 'key1', key, cls print 'key2', ', '.join(key2) print 'sum1', sum print 'sum2', sum2
def text_rank4(obj): sents = list(cut_sentence(obj.content)) vect = TfidfVectorizer(min_df=1, tokenizer=Tokenize) tfidf = vect.fit_transform(sents) tfidf_graph = tfidf * tfidf.T #print 's ', tfidf_graph.A #print 'o ', np.dot(tfidf.A,tfidf.A.T) nx_graph = nx.from_scipy_sparse_matrix(tfidf_graph) scores = nx.pagerank(nx_graph) res = sorted(((scores[i], i) for i, s in enumerate(sents)), reverse=True) top_n_summary = [sents[i] for _, i in sorted(res[:3])] print 'text_rank4', u'。 '.join(top_n_summary).replace('\r', '').replace( '\n', '') + u'。'
def test2(): obj = HtmlContent.objects.get(pk=34) sents = list(cut_sentence(obj.content)) docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents] num_terms = 400 test_corpus = lsi_model[tfidf_model[docs]] #test_sparse = matutils.corpus2csc(test_corpus, num_terms).transpose(copy=False) test_dense = matutils.corpus2dense(test_corpus, num_terms).T test_a = [a for a in test_dense] sim_res = np.fromiter(itertools.imap(cos_distance, itertools.product(test_a,test_a)), dtype=np.float) l = len(sents) sim_res = np.reshape(sim_res,(l,l)) print 'lsi=', sim_res
def test1(): obj = HtmlContent.objects.get(pk=46) key,sum,cls = summarize(obj.content) key2 = key_rank(obj.content) sents = list(cut_sentence(obj.content)) docs = [list(Tokenize(sent)) for sent in sents] sum2 = summarize4(sents) print 'key1', key, cls print 'key2',', '.join(key2) print 'sum1',sum print 'sum2',sum2
def test_rank3(obj): sents = list(cut_sentence(obj.content)) docs = [list(Tokenize(sent)) for sent in sents] vect = TfidfVectorizer(min_df=1,tokenizer=Tokenize) tfidf = vect.fit_transform(sents) lsa = TruncatedSVD(5) lsa_res = lsa.fit_transform(tfidf) lsa_res = Normalizer(copy=False).fit_transform(lsa_res) tfidf_graph = np.dot(lsa_res,lsa_res.T) tfidf_graph = abs(Normalizer(copy=False).fit_transform(tfidf_graph)) nx_graph = nx.from_numpy_matrix(tfidf_graph) scores = nx.pagerank(nx_graph) res = sorted(((scores[i],i) for i,s in enumerate(sents)), reverse=True) top_n_summary = [sents[i] for _,i in sorted(res[:3])] print 'test_rank3', u'。 '.join(top_n_summary).replace('\r','').replace('\n','')+u'。'
def summarize3(txt):
    """Summarize *txt* by keeping sentences whose keyword score exceeds
    mean + 0.5 * std of all sentence scores.
    """
    # Lowercase once while collecting; the original lowered the already-
    # lowered sentences a second time into normalized_sentences, which
    # was redundant work (str.lower is idempotent).
    sentences = [s.lower() for s in cppjiebapy.cut_sentence(txt)]
    (top_n_words, _) = key_words(txt, N_3)
    scored_sentences = __score_sentences(sentences, top_n_words)
    score_values = [score for (_, score) in scored_sentences]
    avg = np.mean(score_values)
    std = np.std(score_values)
    threshold = avg + 0.5 * std
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > threshold]
    mean_scored_summary = [sentences[idx] for (idx, score) in mean_scored]
    return u"。 ".join(mean_scored_summary) + u"。 "
def test2(): obj = HtmlContent.objects.get(pk=34) sents = list(cut_sentence(obj.content)) docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents] num_terms = 400 test_corpus = lsi_model[tfidf_model[docs]] #test_sparse = matutils.corpus2csc(test_corpus, num_terms).transpose(copy=False) test_dense = matutils.corpus2dense(test_corpus, num_terms).T test_a = [a for a in test_dense] sim_res = np.fromiter(itertools.imap(cos_distance, itertools.product(test_a, test_a)), dtype=np.float) l = len(sents) sim_res = np.reshape(sim_res, (l, l)) print 'lsi=', sim_res
def test_rank3(obj): sents = list(cut_sentence(obj.content)) docs = [list(Tokenize(sent)) for sent in sents] vect = TfidfVectorizer(min_df=1, tokenizer=Tokenize) tfidf = vect.fit_transform(sents) lsa = TruncatedSVD(5) lsa_res = lsa.fit_transform(tfidf) lsa_res = Normalizer(copy=False).fit_transform(lsa_res) tfidf_graph = np.dot(lsa_res, lsa_res.T) tfidf_graph = abs(Normalizer(copy=False).fit_transform(tfidf_graph)) nx_graph = nx.from_numpy_matrix(tfidf_graph) scores = nx.pagerank(nx_graph) res = sorted(((scores[i], i) for i, s in enumerate(sents)), reverse=True) top_n_summary = [sents[i] for _, i in sorted(res[:3])] print 'test_rank3', u'。 '.join(top_n_summary).replace('\r', '').replace( '\n', '') + u'。'
def summarize3(txt):
    """Summarize *txt* by keeping sentences whose keyword score exceeds
    mean + 0.5 * std of all sentence scores.
    """
    # Lowercase once while collecting; the original lowered the already-
    # lowered sentences a second time into normalized_sentences, which
    # was redundant work (str.lower is idempotent).
    sentences = [s.lower() for s in cppjiebapy.cut_sentence(txt)]
    (top_n_words, _) = key_words(txt, N_3)
    scored_sentences = __score_sentences(sentences, top_n_words)
    score_values = [s[1] for s in scored_sentences]
    avg = np.mean(score_values)
    std = np.std(score_values)
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]
    mean_scored_summary = [sentences[idx] for (idx, score) in mean_scored]
    return u'。 '.join(mean_scored_summary) + u'。 '
def test_rank1(obj): sents = list(cut_sentence(obj.content)) docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents] num_terms = 400 test_corpus = lsi_model[tfidf_model[docs]] test_dense = matutils.corpus2dense(test_corpus, num_terms).T test_a = [a for a in test_dense] sim_res = np.fromiter(itertools.imap(cos_distance, itertools.product(test_a,test_a)), dtype=np.float) l = len(sents) sim_res = np.reshape(sim_res,(l,l)) rank = TextRank(sim_res) rank.solve() top_n_summary = [] for index in rank.top_index(5): top_n_summary.append(sents[index]) print 'test_rank1 ', u'。 '.join(top_n_summary).replace('\r','').replace('\n','')
def test_rank1(obj): sents = list(cut_sentence(obj.content)) docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents] num_terms = 400 test_corpus = lsi_model[tfidf_model[docs]] test_dense = matutils.corpus2dense(test_corpus, num_terms).T test_a = [a for a in test_dense] sim_res = np.fromiter(itertools.imap(cos_distance, itertools.product(test_a, test_a)), dtype=np.float) l = len(sents) sim_res = np.reshape(sim_res, (l, l)) rank = TextRank(sim_res) rank.solve() top_n_summary = [] for index in rank.top_index(5): top_n_summary.append(sents[index]) print 'test_rank1 ', u'。 '.join(top_n_summary).replace('\r', '').replace( '\n', '')
def test5():
    """Fill the `summerize` field (first 400 chars of the summarize4
    output) for every HtmlContent row with non-empty content.

    The attribute name `summerize` (sic) is the model's field name and
    is kept as-is.
    """
    for obj in HtmlContent.objects.filter(~Q(content='')):
        sentences = list(cut_sentence(obj.content))
        tokenized = [list(Tokenize(s)) for s in sentences]
        obj.summerize = summarize4(sentences, tokenized)[:400]
        obj.save()