Example 1
def summarize4(sents, docs=None):
    # `sents` is a list of sentence strings; `docs` optionally carries the
    # same sentences pre-tokenized (each as a list of words). `Tokenize`,
    # `bm25_weights`, and `TextRank` come from the surrounding project.
    if not docs:
        docs = [list(Tokenize(sent)) for sent in sents]
    # Score pairwise sentence similarity with BM25, rank sentences with
    # TextRank, and keep the three best in their original document order.
    sim_res = bm25_weights(docs)
    rank = TextRank(sim_res)
    rank.solve()
    top_n_summary = []
    for index in sorted(rank.top_index(3)):
        top_n_summary.append(sents[index])
    # Join with the Chinese full stop and strip stray line breaks.
    return u"。 ".join(top_n_summary).replace("\r", "").replace("\n", "") + u"。"
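Every example on this page delegates the ranking itself to a project-local TextRank class. As a rough, self-contained illustration of what rank.solve() computes, the sketch below runs power iteration over a dense sentence-similarity matrix; the damping factor, iteration count, and function name here are assumptions, not the project's actual implementation:

# Illustrative sketch only: rank sentences by power iteration over a
# similarity matrix, which is the core idea behind TextRank.solve().
def text_rank_scores(weights, d=0.85, iterations=100):
    n = len(weights)
    scores = [1.0] * n
    # Total outgoing weight per node, used to normalize contributions.
    out_sum = [sum(row) or 1.0 for row in weights]
    for _ in range(iterations):
        scores = [(1 - d) + d * sum(weights[j][i] / out_sum[j] * scores[j]
                                    for j in range(n) if j != i)
                  for i in range(n)]
    return scores

# Toy 3-sentence similarity matrix; sentence 1 overlaps most with the rest.
sim = [[0.0, 0.8, 0.1],
       [0.8, 0.0, 0.7],
       [0.1, 0.7, 0.0]]
print(text_rank_scores(sim))  # the highest score marks the best summary pick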
Example 2
(Identical to Example 1 apart from string quote style; the code is omitted here as a duplicate.)
Example 3
def get_summary(self, texts, n=3):
    # Split the input into sentences, tokenize each with jieba, and rank
    # the sentences with TextRank. `top_index(n)` is computed once instead
    # of on every loop iteration, and iterating in sentence order keeps
    # the summary in document order.
    texts = self.get_sentences(texts)
    doc_sents = [jieba.lcut(i) for i in texts]
    rank = TextRank(doc_sents)
    rank.text_rank()
    top = set(rank.top_index(n))
    results = [texts[j] for j in range(len(texts)) if j in top]
    return "。".join(results) + "。"
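This variant tokenizes with jieba.lcut, jieba's list-returning wrapper around jieba.cut, which is why the tokens can be passed straight to TextRank. A quick standalone check (requires `pip install jieba`):

import jieba

# lcut returns a plain list of tokens, ready to feed into TextRank.
print(jieba.lcut("我来到北京清华大学"))
# ['我', '来到', '北京', '清华大学']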
Example 4
import jieba  # only needed for the commented-out jieba.cut alternative below
# from bm25 import BM25
from textrank import TextRank
import utils
from snownlp import seg
from sys import argv

# Sample input kept for reference (a Chinese theft-case description):
# fact = '公诉机关指控:2016年3月28日20时许,被告人颜某在本市洪山区马湖新村足球场马路边捡拾到被害人谢某的VIVOX5手机一部,' \
#       '并在同年3月28日2、1时起,分多次通过支付宝小额免密支付功能,秘密盗走被害人谢某支付宝内人民币3723元。案发后,被告人颜某家属已赔偿被害人全部损失,' \
#       '并取得谅解。公诉机关认为被告人颜某具有退赃、取得谅解、自愿认罪等处罚情节,建议判处被告人颜某一年以下××、××或者××,并处罚金。'

if __name__ == '__main__':
    # Read the text to summarize from the command line inside the main
    # guard, so importing this file does not raise an IndexError.
    fact = argv[1]

    # Split into sentences, segment each with snownlp's seg, drop stop words.
    sents = utils.get_sentences(fact)
    doc = []
    for sent in sents:
        words = seg.seg(sent)
        # words = list(jieba.cut(sent))
        words = utils.filter_stop(words)
        doc.append(words)
    # print(doc)
    # s = BM25(doc)
    # print(s.f)
    # print(s.df)
    # print(s.idf)

    # Rank the sentences with TextRank and print the top three.
    rank = TextRank(doc)
    rank.text_rank()
    for index in rank.top_index(3):
        print(sents[index])
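The script leans on a local utils module that is not shown here. Inferring purely from how it is called above, a plausible minimal version might look like this; the regular expression and the stop-word source are assumptions:

# Hypothetical utils.py, reconstructed from the calls above; the real
# module in the source project may differ.
import re

stop_words = set()  # in practice, load a stop-word list from disk

def get_sentences(text):
    # Split on common Chinese/ASCII sentence delimiters and line breaks.
    parts = re.split(u'[。!?!?\r\n]', text)
    return [p.strip() for p in parts if p.strip()]

def filter_stop(words):
    return [w for w in words if w not in stop_words]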
Example 5
import re

# Seg, Tag, KeywordRank, TextRank, and stop_words are assumed to come from
# the surrounding (snownlp-style) package; only `re` is standard library here.

class Order:
    def __init__(self, text, seg=None, tagger=None):
        self.text = text
        self.tagger = tagger if tagger is not None else self.get_tagger()
        self.seg = seg if seg is not None else self.get_seg()
        self.words_merge = None

    def get_keywords(self, limit=5, merge=False):
        # Tokenize each sentence, drop stop words, and rank individual
        # words with KeywordRank (a keyword-level TextRank).
        doc = []
        sentences = self.get_sentences()
        for sentence in sentences:
            words = list(self.seg.seg(sentence))
            words = self.filter_stop(words)
            doc.append(words)

        self.keywordrank = KeywordRank(doc)
        self.keywordrank.solve()
        result = []
        for w in self.keywordrank.top_index(limit):
            result.append(w)

        if merge:
            # __init__ only ever sets words_merge to None, so merging can
            # work only if a caller assigns a merger object first; guard
            # against the AttributeError the original code would raise.
            if self.words_merge is None:
                return result
            wm = self.words_merge.merge(self.text, result)
            return wm.merge()
        return result

    def get_summaries(self, limit=5):
        # Same pipeline as get_keywords, but rank whole sentences with
        # TextRank and return the top `limit` of them.
        doc = []
        sentences = self.get_sentences()
        for sentence in sentences:
            words = list(self.seg.seg(sentence))
            words = self.filter_stop(words)
            doc.append(words)

        self.textrank = TextRank(doc)
        self.textrank.solve()
        result = []
        for index in self.textrank.top_index(limit):
            result.append(sentences[index])
        return result

    def get_sentences(self):
        # Split on line breaks first, then on clause delimiters. The comma
        # is included, so the "sentences" returned are really clauses.
        line_break_re = re.compile('[\r\n]')
        delimiter_re = re.compile('[,。?!;]')
        sentences = []
        for line in line_break_re.split(self.text):
            line = line.strip()
            if not line:
                continue

            for sentence in delimiter_re.split(line):
                sentence = sentence.strip()
                if not sentence:
                    continue
                sentences.append(sentence)

        return sentences

    def get_seg(self, fname='seg.pickle'):
        # Load a pre-trained segmenter from a pickle on disk.
        seg = Seg()
        seg.load(fname)
        return seg

    def get_tagger(self, fname='tag.pickle'):
        # Load a pre-trained POS tagger from a pickle on disk.
        tagger = Tag()
        tagger.load(fname)
        return tagger

    def filter_stop(self, words):
        return list(filter(lambda x: x not in stop_words, words))
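A minimal usage sketch for this class; it assumes the pickled models (seg.pickle and tag.pickle, the default file names above) are available next to the script, and the input text is a placeholder:

# Hypothetical driver for the Order class above; not part of the original.
text = u'这里是一段待摘要的中文文本。它包含若干句子,用于演示。'  # placeholder input
order = Order(text)                  # loads seg.pickle and tag.pickle
print(order.get_keywords(limit=5))   # five top-ranked words
print(order.get_summaries(limit=3))  # three top-ranked clauses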