def summarize4(sents, docs=None):
    """Summarize *sents*: rank sentences with TextRank over BM25 weights.

    Returns the top 3 sentences (in original order) joined with '。 ',
    with any carriage returns / newlines stripped, plus a trailing '。'.
    """
    if not docs:
        docs = [list(Tokenize(sent)) for sent in sents]
    ranker = TextRank(bm25_weights(docs))
    ranker.solve()
    chosen = [sents[i] for i in sorted(ranker.top_index(3))]
    return u"。 ".join(chosen).replace("\r", "").replace("\n", "") + u"。"
def summarize4(sents, docs=None):
    """Rank sentences by TextRank (BM25 similarity) and return a 3-sentence summary.

    NOTE(review): this duplicates an identical ``summarize4`` defined earlier
    in the file — confirm which definition is intended to win.
    """
    if not docs:
        docs = []
        for sent in sents:
            docs.append(list(Tokenize(sent)))
    rank = TextRank(bm25_weights(docs))
    rank.solve()
    parts = []
    for idx in sorted(rank.top_index(3)):
        parts.append(sents[idx])
    joined = u'。 '.join(parts)
    return joined.replace('\r', '').replace('\n', '') + u'。'
def get_summary(self, texts, n=3):
    """Return the top-``n`` TextRank sentences of *texts*, joined with '。'.

    Sentences are kept in their original order. A trailing '。' is appended.
    """
    texts = self.get_sentences(texts)
    doc_sents = [jieba.lcut(t) for t in texts]
    rank = TextRank(doc_sents)
    rank.text_rank()
    # Fix: the original called rank.top_index(n) once per sentence inside the
    # loop; compute it once and use a set for O(1) membership tests.
    top = set(rank.top_index(n))
    results = [texts[j] for j in range(len(texts)) if j in top]
    return "。".join(results) + "。"
import jieba
# from bm25 import BM25
from textrank import TextRank
import utils
from snownlp import seg
from sys import argv

# Text to summarize is supplied as the first command-line argument.
fact = argv[1]

if __name__ == '__main__':
    sentences = utils.get_sentences(fact)
    corpus = []
    for sentence in sentences:
        tokens = seg.seg(sentence)
        # tokens = list(jieba.cut(sentence))
        corpus.append(utils.filter_stop(tokens))
    ranker = TextRank(corpus)
    ranker.text_rank()
    # Print the three highest-ranked sentences.
    for idx in ranker.top_index(3):
        print(sentences[idx])
class Order:
    """Keyword and summary extraction over a single text via TextRank.

    Parameters
    ----------
    text : str
        Raw text to analyse.
    seg : optional
        Word segmenter; loaded from ``seg.pickle`` when omitted.
    tagger : optional
        POS tagger; loaded from ``tag.pickle`` when omitted.
    """

    def __init__(self, text, seg=None, tagger=None):
        self.text = text
        self.tagger = tagger if tagger is not None else self.get_tagger()
        self.seg = seg if seg is not None else self.get_seg()
        # NOTE(review): words_merge is never assigned a real object here, yet
        # get_keywords(merge=True) calls it — confirm callers set it first.
        self.words_merge = None

    def _build_doc(self, sentences):
        """Segment each sentence and drop stop words; return a list of token lists."""
        doc = []
        for sentence in sentences:
            words = list(self.seg.seg(sentence))
            doc.append(self.filter_stop(words))
        return doc

    def get_keywords(self, limit=5, merge=False):
        """Return the top-``limit`` keywords; merge into phrases when *merge* is true."""
        # Fix: the sentence->token-list loop was duplicated verbatim in
        # get_keywords and get_summaries; it now lives in _build_doc.
        self.keywordrank = KeywordRank(self._build_doc(self.get_sentences()))
        self.keywordrank.solve()
        result = list(self.keywordrank.top_index(limit))
        if merge:
            wm = self.words_merge.merge(self.text, result)
            return wm.merge()
        return result

    def get_summaries(self, limit=5):
        """Return the top-``limit`` sentences ranked by TextRank."""
        sentences = self.get_sentences()
        self.textrank = TextRank(self._build_doc(sentences))
        self.textrank.solve()
        return [sentences[index] for index in self.textrank.top_index(limit)]

    def get_sentences(self):
        """Split ``self.text`` into sentences on line breaks and CJK delimiters.

        Blank lines and empty fragments are discarded; order is preserved.
        """
        # Raw strings for regex literals (the originals were plain strings).
        line_break_re = re.compile(r'[\r\n]')
        delimiter_re = re.compile(r'[,。?!;]')
        sentences = []
        for line in line_break_re.split(self.text):
            line = line.strip()
            if not line:
                continue
            for sentence in delimiter_re.split(line):
                sentence = sentence.strip()
                if not sentence:
                    continue
                sentences.append(sentence)
        return sentences

    def get_seg(self, fname='seg.pickle'):
        """Load and return the default word segmenter from *fname*."""
        seg = Seg()
        seg.load(fname)
        return seg

    def get_tagger(self, fname='tag.pickle'):
        """Load and return the default POS tagger from *fname*."""
        tagger = Tag()
        tagger.load(fname)
        return tagger

    def filter_stop(self, words):
        """Return *words* with stop words removed, preserving order."""
        return [w for w in words if w not in stop_words]