Code example #1
def textRank_ppt(url, num_abs):
    """

    :param url:
    :param num_abs: 生成ppt张数
    :return:
    """
    title, texts = article_extract(url)
    tr4w = TextRank4Keyword()
    tr4w.analyze(
        text=texts, lower=True, window=2
    )  # in Py2, text must be a utf-8 encoded str or a unicode object; in Py3, a utf-8 encoded bytes or str object
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=texts, lower=True, source='all_filters')
    print('关键词:')
    key_words = ""
    for item in tr4w.get_keywords(7, word_min_len=2):
        print(item.word, item.weight)
        key_words = key_words + item.word + "\n"

    # generate the PPT
    prs = Presentation()
    slide1, body_shape1 = ppt1.add_slide(prs=prs,
                                         slide_title=title,
                                         style_number=0)
    slide2, body_shape2 = ppt1.add_slide(prs=prs,
                                         style_number=1,
                                         slide_title="关键词",
                                         content="")
    ppt1.add_paragraph(body_shape2, text=key_words, size=20)
    i = 0
    # extract images and add them to the PPT
    extract_image.pic_extract(url)
    print("句子:")
    for item in tr4s.get_key_sentences(num=(num_abs - 2) * 2):

        if i % 2 == 0:
            slide3, body_shape3 = ppt1.add_slide(prs=prs,
                                                 style_number=1,
                                                 slide_title="摘要",
                                                 content="")
            try:
                ppt1.add_picture(slide2=slide3,
                                 pic_path="image1/image_" + str(i) + ".jpg")
            except Exception:  # the image for this slide may be missing
                print("no picture")
        i += 1
        # print(len(item.sentence),item.index)
        ppt1.add_paragraph(body_shape3, text=item.sentence, size=20)
    prs.save('test.pptx')
    print("ppt 已生成")
Code example #2
File: GetAbstract.py  Project: SUNGLITAU/npoms
def get_abstract(data, a, b, c):
    """
    生成excel可以打开的的摘要文件
    :param data: MySQL从取出的数据
    :param a: 起始时间的
    :param b: 结束时间
    :param c: 话题ID
    :return: null
    """
    now_path = os.getcwd()
    path = now_path.replace('\\', '/')
    tr4s = TextRank4Sentence()

    print('当前文章的摘要:')
    results = []
    for i in range(len(data['CONTENT'])):

        # i = re.sub("[^\u4e00-\u9fa5]", '', i)  # keep only Chinese text? Without sentence
        # breaks this is not a real abstract, so the punctuation classes below are kept instead.
        # The escaped ranges cover: 。 ; , : “ ” ( ) 、 ? 《 》
        tmp = re.sub(
            "[^\u4e00-\u9fa5\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b0-9]",
            '', data['CONTENT'][i])
        tr4s.analyze(text=tmp, lower=True)
        result = ''
        # print()
        # print('Abstract:')
        for item in tr4s.get_key_sentences(num=3):
            # print(item.index, item.weight, item.sentence)
            result += item.sentence
        if len(result) != 0:
            results.append([
                data['UPTIME'][i], data['TITLE'][i], data['AUTHOR'][i], result
            ])
        # data['CONTENT'][i] = results
    column_name = ['更新时间', '标题/题目', '作者', '摘要']

    tmp_text = pd.DataFrame(columns=column_name, data=results)
    tmp_text.to_csv('./data/textrank/topic{}_{}-{}abstract.csv'.format(
        c, a, b),
                    encoding='utf_8_sig')
    print('>>>>>>>>>>>>>>  已经保存到csv等待计算或查看  >>>>>>>>>>>>>>>')
    # os.startfile(now_path + '/data/textrank/topic{}_{}-{}abstract.csv'.format(c, a, b))  # open the sheet
    # (this needs the full working path, e.g. T:/AC/Python/PublicOpinionMonitor/data/textrank/topic{}_abstract.csv)
    # print(results)
    # print(' >>>>>> excel will be closed in 10 seconds >>>>>>>> ')
    time.sleep(10)
    clear_all_var()
    return
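
A hedged usage sketch; the data dict below mimics the MySQL result shape the function expects (columns CONTENT, UPTIME, TITLE, AUTHOR) with made-up values, and assumes the ./data/textrank directory plus the module's imports (os, re, time, pandas) and its clear_all_var helper exist:

data = {
    'CONTENT': ['今日,公司发布公告称将增资子公司。增资完成后,注册资本将明显提高。'],
    'UPTIME': ['2020-01-01'],
    'TITLE': ['示例公告'],
    'AUTHOR': ['示例作者'],
}
get_abstract(data, a='20200101', b='20200102', c=1)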
Code example #3
def abstract(fileName, step):
    tr4s = TextRank4Sentence()
    with open(fileName, 'r') as f:  # close the file after reading
        text = f.readlines()
    articleLen = countArticleLen(text)
    text = [t.split('\n')[0].strip() for t in text]
    if not judge(text):  # if judge() returns False, merge the lines and split manually
        text = ''.join(text)
        text = text.replace('……', ',')
        head, middles, last = splitPart(text, step)
        cate, result = mergeAbstract(tr4s, text, head, middles, last)
    else:
        cate, result = paragramAbstarct(tr4s, text)
    return cate, result, articleLen
Code example #4
def _36r_keyword_abstract(article, keywords_len, sentences_len):
    # extract keywords
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=article, lower=True, window=2)
    keywords = []
    for item in tr4w.get_keywords(keywords_len, word_min_len=1):
        keywords.append(item.word)
    # extract the abstract
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=article, lower=True, source='all_filters')
    abstract = []
    for item in tr4s.get_key_sentences(num=sentences_len):
        abstract.append(item.sentence + '。')
    abstract = '\n'.join(abstract)
    return keywords, abstract
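
A hedged usage sketch with an illustrative Chinese snippet (the text is made up for demonstration):

sample = ("媒体融合发展是一项紧迫课题。要推动传统媒体和新兴媒体深度融合。"
          "融合发展关键在融为一体、合而为一。")
keywords, abstract = _36r_keyword_abstract(sample, keywords_len=5, sentences_len=2)
print(keywords)
print(abstract)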
Code example #5
def get_key_sents(cont, num=3):
    '''
    Extract an abstract from the content;
    num=3 extracts three key sentences.
    return: the extracted sentences joined with underscores
    '''
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=cont, lower=True, source='all_filters')
    summaries = [item.sentence for item in tr4s.get_key_sentences(num=num)]
    return '_'.join(summaries)
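
An illustrative call (the input string is made up):

cont = "前三季度公司营收增长两成。净利润同比明显提升。主营业务保持稳健。"
print(get_key_sents(cont, num=2))  # two key sentences joined by '_'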
Code example #6
def text_rank_subtract(content: str, n=3):
    """
    传入中文字符串,使用text rank方法抽取摘要
    :param content: (str)文章内容
    :param n: 抽取摘要的句子数量n
    :return: summary_sentences(list)摘要句子列表
    """
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=content, lower=True, source='all_filters')
    # extract the abstract
    summary_sentences = [
        item.sentence for item in tr4s.get_key_sentences(num=n)
    ]

    return summary_sentences
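
A hedged usage sketch (illustrative text only):

sample = ("此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元。"
          "增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。")
for sent in text_rank_subtract(sample, n=2):
    print(sent)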
Code example #7
    def cal_key_sentence(self):
        # output the most important sentences
        tr4s = TextRank4Sentence()
        tr4s.analyze(text=self.text, lower=True, source='all_filters')
        # print('Abstract:')
        # the sentence_num highest-ranked sentences
        for item in tr4s.get_key_sentences(
                num=self.sentence_num):  # sentence_num is the number of key sentences to produce
            # index is the sentence's position in the text, weight is its score
            # print('item.index, item.weight, item.sentence', item.index, item.weight, item.sentence)
            # self.import_sentence.append(str(item.index) + ' ' + item.sentence)
            self.import_sentence.append(item.sentence)

        # print('self.import_sentence',self.import_sentence[0][2:] + self.import_sentence[1][2:] + self.import_sentence[2][2:])
        print('self.import_sentence', self.import_sentence)
Code example #8
    def __summary(self, post: dict):
        '''
        Summarize the post's filtered content.

        Args:
            post:
                The dict as a collection of all desired information in a post.
        '''
        print(post['link'], 'nlp and takes time...')
        trw = TextRank4Keyword()
        trw.analyze(post['content_filtered'], lower=True)
        post['keywords'] = [i.word for i in trw.get_keywords(3)]
        trs = TextRank4Sentence()
        trs.analyze(post['content_filtered'], lower=True)
        post['summary'] = [i.sentence for i in trs.get_key_sentences(2)]
Code example #9
File: Topic.py  Project: yangfanjx/SearchEngine
    def topic_paragraph(self):
        with open('./1.txt') as f:
            data = f.read()

        # print(data)
        tr4s = TextRank4Sentence()
        tr4s.analyze(text=data, lower=True, source="all_filters")
        abstract = []
        for item in tr4s.get_key_sentences(num=100):
            if len(item.sentence) < 300:
                abstract.append([item.index, item.sentence])

        abstract = sorted(abstract[:1], key=lambda x: x[0])
        abstract = [
            "(%i) %s \n" % (i, x[1]) for i, x in enumerate(abstract, 1)
        ]
        return abstract
Code example #10
File: abstract.py  Project: xzb123/read_parse_storage
def getAbstarct(text, sentencesNum=3, lists=False):
    """Extract an abstract of sentencesNum high-weight sentences from text.
    Returns a single string by default, or a list of the sentences when lists is True."""
    if sentencesNum < 2:
        sentencesNum = 2
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    # print('Abstract:')
    ss = []
    for item in tr4s.get_key_sentences(num=sentencesNum):
        # print each sentence's index, weight and content
        # print(item.index, item.weight, item.sentence)
        ss.append(item.sentence)
    if lists:
        return ss       # a list of n sentences
    return "。".join(ss)  # a single string by default
Code example #11
 def process(self):
     for i in self.text:
         self.article += i.getText() + '\n'
     self.article = self.article.strip()
     keywords = []
     abstract = []
     ## keywords
     tr4w = TextRank4Keyword()
     tr4w.analyze(text=self.article, lower=True, window=2)
     for item in tr4w.get_keywords(4, word_min_len=1):
         keywords.append(item.word)
     ## abstract
     tr4s = TextRank4Sentence()
     tr4s.analyze(text=self.article, lower=True, source = 'all_filters')
     for item in tr4s.get_key_sentences(num=3):
         abstract.append(item.sentence)
     return keywords, abstract
Code example #12
File: csv_process.py  Project: linyuefeng123/ssf_uda
def dataframe_keyword_extraction(data):
    mod = TextRank4Sentence()
    label_index = data.columns.get_loc('content')
    for i in range(len(data)):
        res = ""
        print("处理第{}条数据".format(i))
        content = data.iloc[i]['content']
        if (len(content) < 300):
            continue
        mod.analyze(text=content, lower=False, source='all_filters')

        for item in mod.get_key_sentences(num=3):  # avoid shadowing the built-in str
            if len(res) < 256:
                res += item.sentence
        print(res)
        data.iloc[i, label_index] = res
    return data
Code example #13
def parse_abstract_textrank(text):
    import sys
    try:
        reload(sys)
        sys.setdefaultencoding('utf-8')
    except:
        pass

    from textrank4zh import TextRank4Keyword, TextRank4Sentence

    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')

    res = ""
    for item in tr4s.get_key_sentences(num=3):
        res += item.sentence  # index is the sentence's position in the text, weight its score
    return res
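
Because the function bundles its own import and a Py2 encoding shim, it can be called directly (illustrative text):

text = "这是一篇测试文章。它包含若干句子。TextRank 会选出其中权重最高的三句。文章到此结束。"
print(parse_abstract_textrank(text))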
Code example #14
def ks_plot(text, number):
    tr4s = TextRank4Sentence()
    tr4s.analyze(text, lower=True, source='no_stop_words')
    data = pd.DataFrame(data=tr4s.key_sentences)
    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(facecolor='w')
    plt.plot(data['weight'], 'ro-', lw=2, ms=5, alpha=0.7)
    plt.grid(True)  # the b= keyword was removed in newer Matplotlib releases
    plt.xlabel(u'句子', fontsize=14)
    plt.ylabel(u'重要度', fontsize=14)
    plt.title(u'句子的重要度曲线', fontsize=18)
    plt.show()

    key_sentences = tr4s.get_key_sentences(num=number, sentence_min_len=4)
    for sentence in key_sentences:
        print(sentence['weight'], sentence['sentence'])
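
The snippet relies on module-level aliases that are not shown; a hedged usage sketch with the assumed imports (a Chinese-capable font such as SimHei must be installed for the labels to render):

import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from textrank4zh import TextRank4Sentence

ks_plot("第一句话很重要。第二句话次之。第三句话一般。第四句话收尾。", number=2)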
Code example #15
def cal_rouge_textRank(article_lst, summ_lst):
    rouge = Rouge()
    tr4s = TextRank4Sentence()

    rouges = np.zeros((3, 3))
    cnt = 0
    for article, summ in zip(article_lst, summ_lst):
        tr4s.analyze(text=article, lower=True, source='all_filters')

        keysentence_list = list()
        for item in tr4s.get_key_sentences(num=10):
            s = re.sub(r"\d{4,}", '', item.sentence)  # drop runs of four or more digits
            keysentence_list.append(s)
        hyps = ""
        for j, sentence in enumerate(keysentence_list):
            if j == 0 and len(sentence) > 60:
                hyps = sentence[:60]
                break
            if (len(hyps) + len(sentence)) <= 60:
                hyps += sentence
            else:
                break

        hyps = clean(hyps)
        summ = summ.strip()
        summ = clean(summ)
        summ_ids, hyps_ids = word_for_rouge(summ, hyps)
        rouge_score = rouge.get_scores(" ".join(hyps_ids)[:len(summ)],
                                       " ".join(summ_ids))

        rouge1 = rouge_score[0]["rouge-1"]
        rouge2 = rouge_score[0]["rouge-2"]
        rougel = rouge_score[0]["rouge-l"]

        rouges[0] += np.array(list(rouge1.values()))
        rouges[1] += np.array(list(rouge2.values()))
        rouges[2] += np.array(list(rougel.values()))
        cnt += 1

    rouges = rouges / cnt
    print("Rouge: Rouge-1 : F P R")
    print("Rouge: Rouge-2 : F P R")
    print("Rouge: Rouge-L : F P R")
    print(rouges)
Code example #16
File: single_pass_cluster.py  Project: lbrb/hasika_ml
    def fit_transform(self, theta=0.5):
        datMat = self.loadData(self.data)
        word_segmentation = []
        for i in range(len(datMat)):
            word_segmentation.append(self.word_segment(datMat[i]))
        print(
            "............................................................................................"
        )
        print('文本已经分词完毕 !')

        # obtain the vector-space representation of the texts
        corpus_tfidf = self.get_Tfidf_vector_representation(word_segmentation)
        # corpus_tfidf =  self.get_Doc2vec_vector_representation(word_segmentation)
        dictTopic, clusterTopic = self.single_pass(corpus_tfidf, datMat, theta)
        print(
            "............................................................................................"
        )
        print("得到的主题数量有: {} 个 ...".format(len(dictTopic)))
        print(
            "............................................................................................\n"
        )
        # sort topics by number of clustered sentences to find the important clusters
        clusterTopic_list = sorted(clusterTopic.items(),
                                   key=lambda x: len(x[1]),
                                   reverse=True)
        for k in clusterTopic_list[:30]:
            cluster_title = '\n'.join(k[1])
            # print(''.join(cluster_title))
            # get the topic keywords for each cluster
            word = TextRank4Keyword()
            word.analyze(''.join(self.word_segment(''.join(cluster_title))),
                         window=5,
                         lower=True)
            w_list = word.get_keywords(num=10, word_min_len=2)
            sentence = TextRank4Sentence()
            sentence.analyze('\n'.join(k[1]), lower=True)
            s_list = sentence.get_key_sentences(num=3, sentence_min_len=5)[:30]
            print("【主题索引】:{} \n【主题声量】:{} \n【主题关键词】: {} \n【主题中心句】 :\n{}".format(
                k[0], len(k[1]), ','.join([i.word for i in w_list]),
                '\n'.join([i.sentence for i in s_list])))
            print('\n')
            print("【标题】:", '\n'.join([content[:20] for content in k[1]]))
            print(
                "-------------------------------------------------------------------------"
            )
Code example #17
def sina_keyword_abstract(article, keywords_len, sentences_len):
    # extract keywords
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=article, lower=True, window=2)
    keywords = []
    for item in tr4w.get_keywords(keywords_len, word_min_len=1):
        keywords.append(item.word)
    # extract the abstract
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=article, lower=True, source='all_filters')
    abstract = []
    for item in tr4s.get_key_sentences(num=sentences_len):
        if str(item.sentence).startswith('原标题') or str(item.sentence).startswith('责任编辑') or str(
                item.sentence).startswith('来源'):
            continue
        abstract.append(item.sentence + '。')
    abstract = '\n'.join(abstract)
    return keywords, abstract
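
A hedged usage sketch (made-up article text); sentences starting with 原标题, 责任编辑 or 来源 are skipped as boilerplate:

article = "原标题:示例新闻。公司今日宣布增资计划。增资将用于扩大产能。责任编辑:某某。"
keywords, abstract = sina_keyword_abstract(article, keywords_len=5, sentences_len=2)
print(keywords)
print(abstract)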
Code example #18
def text_keyword_abstract(article, keywords_len, sentences_len):
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=article, lower=True, window=2)
    keywords = []
    for item in tr4w.get_keywords(keywords_len, word_min_len=2):
        keywords.append(item.word)
    keywords = ' '.join(keywords)

    sentences = article.split('.')
    first_sentence = sentences[0]
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=article, lower=True, source='all_filters')
    abstract = []
    for item in tr4s.get_key_sentences(num=sentences_len):
        if item.sentence != first_sentence:
            abstract.append(item.sentence + '.')
    abstract = '\n'.join(abstract)
    return keywords  #, abstract
Code example #19
def key_text(text):
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    import_sentence = []
    for item in tr4s.get_key_sentences(num=3):  # num is the number of key sentences to produce
        # index is the sentence's position in the text, weight is its score
        # print('item.index, item.weight, item.sentence', item.index, item.weight, item.sentence)
        # self.import_sentence.append(str(item.index) + ' ' + item.sentence)
        import_sentence.append(item.sentence)
    # key_sentence = [i.sentence for i in tr4s.get_key_sentences(num=3)]
    # the core content is the title + the top-3 key sentences of the body; the stored
    # sentences no longer carry an index prefix, so no [2:] slicing is needed
    key_sentences = import_sentence[0] + import_sentence[1] + import_sentence[2]
    # truncate overly long text
    if len(key_sentences) < 512:
        return key_sentences
    else:
        return key_sentences[:512]
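
An illustrative call (the input needs at least three sentences, or the indexing above raises IndexError):

text = "第一句说明主题。第二句展开论述。第三句给出结论。第四句补充说明。"
print(key_text(text))  # top-3 key sentences concatenated, capped at 512 characters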
Code example #20
        def nlp(contents):
            tr4w = TextRank4Keyword()
            tr4w.analyze(text=''.join(i for i in contents),
                         lower=True,
                         window=2)

            tr4s = TextRank4Sentence()
            tr4s.analyze(text=''.join(i for i in contents),
                         lower=True,
                         source='all_filters')

            keyword = [item for item in tr4w.get_keywords(20, word_min_len=1)]
            keyphase = [
                item for item in tr4w.get_keyphrases(keywords_num=20,
                                                     min_occur_num=2)
            ]
            keysentence = [item for item in tr4s.get_key_sentences(num=3)]
            return keyword, keyphase, keysentence
Code example #21
def cctv_abstract(date):
    """从CCTV新闻联播当天的内容中抽取10个句子作为摘要

    :param date: str
        日期,如:20181222
    :return: str
    """
    news = ts_pro.cctv_news(date=date)
    contents = "".join(list(news['content']))

    tr4s = TextRank4Sentence()
    tr4s.analyze(text=contents, lower=True, source='all_filters')

    abstract = []
    for i, item in enumerate(tr4s.get_key_sentences(num=10), 1):
        abstract.append("(%i) %s。\n" % (i, item.sentence))

    return "".join(abstract)
Code example #22
File: event.py  Project: yakiyang/literature_project1
def get_event(text):
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')

    # use local lists rather than implicit globals
    index_list, sentence_list, parse_list, events = [], [], [], []
    for item in tr4s.get_key_sentences(num=len(tr4s.sentences) // 2):  # num must be an int
        if '說' not in item.sentence:
            index_list.append(item.index)
        # print(item.index, item.weight, item.sentence)
    index_list.sort()

    for i in index_list:
        #print(tr4w.sentences[i])
        sentence_list.append(tr4s.sentences[i])


    for i in index_list:
        s = tr4s.sentences[i]
        p = parse_tree(s)
        tmp_list = []
        for j in range(len(p)):  # avoid reusing the outer loop variable
            if p[j][6] != 'N':
                tmp_list.append((p[j], s))
        parse_list.append(tmp_list)


    for lst in parse_list:  # avoid shadowing the built-in list
        for s in lst:
            index = get_index(s[0])
            index.sort()
            event = ''
            for i in index:
                for j in range(i, len(s[0])):
                    if s[0][j] == ':':
                        start = j+1
                    if s[0][j] == '|' or s[0][j] == ')':
                        end = j
                        break

                if s[0][start:end] != '說':
                    event += s[0][start:end]
            if len(event) != 0:
                events.append(event)

    return events
Code example #23
def summary_candidate_fin(text):
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')

    # keep only Chinese, English and digits (drops Korean/Japanese/German text, emoji, etc.)
    # reference: https://zhuanlan.zhihu.com/p/84625185
    rule = re.compile(u"[^a-zA-Z0-9\u4e00-\u9fa5]")

    # print('Abstract:')
    tt = []
    for i, item in enumerate(tr4s.get_key_sentences(num=3,
                                                    sentence_min_len=80)):

        # print('rank {}: first-pass abstract via TextRank:'.format(i + 1))
        # print(item.index, item.weight, item.sentence)

        s = SnowNLP(item.sentence)
        # print('result of a second pass with SnowNLP:')
        secnd_sn = s.summary(3)
        #print(secnd_sn)
        for cont in secnd_sn:
            ttt = rule.sub(' ', str(cont))
            if len(ttt.split(' ')) < 3 and len(ttt) > 12:
                tt.append(ttt)
        #print(' ')

    s = SnowNLP(text)
    # print('abstract straight from SnowNLP:')
    word = {}
    first_sn = s.summary(3)
    for cont in first_sn:
        ttt = rule.sub(' ', str(cont))
        if len(ttt.split(' ')) < 3 and len(ttt) > 12:
            if word.get(ttt) is None:
                word[ttt] = 1
                tt.append(ttt)
    #print(first_sn)
    #print(' ')

    if len(tt) == 0:
        print('無適合的標題')
        tt.append("無適合的標題")
        return tt
    return tt
Code example #24
 def get_summary(self, data, flag=0):
     text = "".join(data)
     if flag == 0:
         tr4w = TextRank4Keyword()
         tr4w.analyze(text=text, lower=True, window=2)
         #ret = tr4w.get_keywords()
         ret = tr4w.get_keyphrases(keywords_num=12, min_occur_num=0)
         if len(ret) > 0:
             return ret[0]
         else:
             return ""
     else:
         tr4s = TextRank4Sentence()
         tr4s.analyze(text=text, lower=True)
         ret = tr4s.get_key_sentences(num=6, sentence_min_len=4)
         if len(ret) > 0:  # the original >= 0 check was always true
             return ret[0]['sentence']
         else:
             return ""
Code example #25
File: pipelines.py  Project: wlj961012/MuseumData
    def process_introduce(self, introduce):

        introduce = introduce.replace('\n', '').replace('\r', '').replace(
            '\t', '').replace('\xa0', '').replace('\u3000',
                                                  '').replace(' ', '')
        tr4s = TextRank4Sentence()
        tr4s.analyze(text=introduce, lower=True, source='all_filters')
        num_sentences = 3
        sentences = tr4s.get_key_sentences(num=num_sentences)
        sentences = sorted(sentences, key=lambda x: x.index)
        # get_key_sentences(num=3) already returns at most three sentences,
        # so keep them all in document order
        news = [s.sentence for s in sentences]
        return ''.join(news)
Code example #26
File: action_4.py  Project: IELBHJY/RS6
def action_two():
    import pandas as pd
    from textrank4zh import TextRank4Keyword, TextRank4Sentence
    news = pd.read_table('textrank/news.txt', encoding='GB18030', header=None)
    strings = ''
    for index in range(news.shape[0]):
        strings += news.loc[index, 0]
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=strings, lower=True, window=3)
    print('关键词:')
    for item in tr4w.get_keywords(20, word_min_len=2):
        print(item.word, item.weight)

    tr4s = TextRank4Sentence()
    tr4s.analyze(text=strings, lower=True, source='all_filters')
    print('摘要:')
    # the three highest-ranked sentences
    for item in tr4s.get_key_sentences(num=3):
        print(item.weight, item.sentence)
Code example #27
def Keyword():
    with codecs.open('/Users/liamtheron/Desktop/Deloiite/test.txt',
                     'r',
                     encoding='utf-8') as f:  # close the file after reading
        text = f.read()
    tr4w = TextRank4Keyword()
    tr4s = TextRank4Sentence()
    tr4w.analyze(text=text, lower=True, window=2)
    tr4s.analyze(text=text, lower=True)
    print('<关键词>:')
    for item in tr4w.get_keywords(20, word_min_len=1):
        print(item.word, item.weight)
    print()
    print('<关键短语>:')
    for phrase in tr4w.get_keyphrases(keywords_num=20, min_occur_num=2):
        print(phrase)
    print()
    print('<摘要>:')
    for item in tr4s.get_key_sentences(num=3):
        print(item.index, item.weight, item.sentence)
Code example #28
def textSummary(fileName, finalName):
    text = codecs.open(fileName, 'r', 'utf-8').read()
    tr4w = TextRank4Keyword()

    tr4w.analyze(
        text=text, lower=True, window=2
    )  # in Py2, text must be a utf-8 encoded str or a unicode object; in Py3, a utf-8 encoded bytes or str object
    # keywords
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    sentences = {}
    for item in tr4s.get_key_sentences(num=50):
        # group the key sentences by their length
        sentences.setdefault(len(item.sentence), []).append(item.sentence)

    key_sentences = []
    for i in range(3):
        key = max(sentences.keys())
        #         print(sentences[key])
        key_sentences.append(sentences[key][0])
        sentences.pop(key)

    text = re.split(
        pattern=
        r'[\u3000-\u301e\ufe10-\ufe19\ufe30-\ufe44\ufe50-\ufe6b\uff01-\uffee]',
        string=text)

    ks = []
    for te in text:
        for k in key_sentences:
            if str(te) in str(k) and str(k) not in ks:
                ks.append(k)

    line = ''
    for k in ks:
        line += k + " "

    fw = codecs.open(finalName + "output.txt", 'a', 'utf-8')
    fw.write(line)
Code example #29
def get_textrank4zh_summarization(contents):
    """
    获取文本摘要
    :param contents: string
    :return: dict of list [{x},{x}]
    """
    # 定义返回前5个文本摘要
    topK = 5
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=contents, lower=True, source='all_filters')

    # logger.info('extracting the abstract with textrank4zh, 5 sentences by default')

    # print('Abstract:')
    # for item in tr4s.get_key_sentences(num=5):
    #     print('position: {}, weight: {}, content: {}'.format(item.index, item.weight, item.sentence))  # index is the sentence's position, weight its score

    result = tr4s.get_key_sentences(num=topK)

    return result
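
The returned items are dict-like, exposing index, weight and sentence; an illustrative call:

contents = "第一句话。第二句话。第三句话。第四句话。第五句话。第六句话。"
for item in get_textrank4zh_summarization(contents):
    print(item.index, item.weight, item.sentence)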
Code example #30
 def extract_key_information(self,
                             num_key_word=30,
                             num_key_phrase=20,
                             num_key_sentence=5):
     text = ''.join(self.article_list)
     # create the keyword-extraction instance
     tr4w = TextRank4Keyword()
     # analyze the text with a window size of 2, lower-casing English words
     tr4w.analyze(text=text, lower=True, window=2)
     with open(self.rule_reference_filename, "a") as f:
         # write the top num_key_word keywords from the keyword list
         f.write(
             '###########################关 键 词##################################'
             + '\n')
         for item in tr4w.get_keywords(num=num_key_word, word_min_len=1):
             if item.word in self.stopwords or item.word in self.filter_dictionary:
                 continue
             else:
                 f.write(item.word + '\t' + str(item.weight) + '\n')
     with open(self.rule_reference_filename, "a") as f:
         # write num_key_phrase key phrases from the key-phrase list
         f.write(
             '##########################关 键 短 语##################################'
             + '\n')
         for phrase in tr4w.get_keyphrases(keywords_num=num_key_phrase,
                                           min_occur_num=2):
             f.write(phrase + '\n')
     # create the sentence-extraction instance
     tr4s = TextRank4Sentence()
     # lower-case English words, filter by part of speech and remove stop words
     tr4s.analyze(text=text, lower=True, source='all_filters')
     with open(self.rule_reference_filename, "a") as f:
         # write num_key_sentence key sentences
         f.write(
             '###########################关 键 句##################################'
             + '\n')
         for item in tr4s.get_key_sentences(num=num_key_sentence):
             f.write(
                 str(item.index) + '\t' + str(item.weight) + '\t' + str(item.sentence) +
                 '\n')
             f.write('----------------' + '\n')