Code example #1
from textrank4zh import TextRank4Keyword, TextRank4Sentence

def text_keyword_abstract(article, keywords_len, sentences_len):
    # extract keywords over a co-occurrence window of 2
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=article, lower=True, window=2)
    keywords = []
    for item in tr4w.get_keywords(keywords_len, word_min_len=2):
        keywords.append(item.word)
    keywords = ' '.join(keywords)

    # extract the most central sentences as the abstract
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=article, lower=True, source='all_filters')
    abstract = []
    for item in tr4s.get_key_sentences(num=sentences_len):
        abstract.append(item.sentence)
    return keywords, abstract
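A minimal usage sketch (not from the original project; the sample article is invented, and textrank4zh must be installed):

article = '此外,公司拟对全资子公司进行增资。增资后,子公司的注册资本将相应提高。'
keywords, abstract = text_keyword_abstract(article, keywords_len=5, sentences_len=2)
print(keywords)             # space-joined keyword string
print('\n'.join(abstract))  # abstract is a list of key sentences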
Code example #2
 def keyword(self):
     # self.cut holds the text to analyze (a plain string in the original class)
     keyword1 = jieba.analyse.textrank(self.cut, topK=5, withWeight=True)
     keyword2 = jieba.analyse.extract_tags(self.cut,
                                           topK=5,
                                           withWeight=True)
     print('Key words from jieba textrank:')
     for word, weight in keyword1:
         print(word, weight)
     print('\nKey words from jieba TF-IDF:')
     for word, weight in keyword2:
         print(word, weight)
     print('\nKey words from textrank4zh:')
     keyword3 = TextRank4Keyword()
     keyword3.analyze(text=self.cut, lower=True, window=2)
     for i in keyword3.get_keywords(5):
         print(i.word, i.weight)
Code example #3
def key_word_extraction(articles):
    key_word = []
    num_key_word = 5
    tr4w = TextRank4Keyword()
    for article in articles:
        tr4w.analyze(text=article,
                     lower=True,
                     window=3,
                     vertex_source='all_filters',
                     edge_source='all_filters')
        # fetch the keyword list once per article rather than re-querying it
        # for every keyword, and tolerate fewer results than num_key_word
        top_words = tr4w.get_keywords(num=num_key_word)
        key_word.append([item.word for item in top_words])
    return key_word
Code example #4
File: getKeyWords.py  Project: lyz21/MachineLearning
def getKeyWordsByTitle(max_num=10, startColum=None, endColum=None):
    # maximum number of keywords per day
    # max_num = 7

    # make pandas display all rows and columns
    pd.set_option('display.max_rows', None)  # rows
    pd.set_option('display.max_columns', None)  # columns
    # load the keyword whitelist
    keyword = pd.read_csv('../data/keyword_title.csv')['word'].values.tolist()
    # load the titles (startColum/endColum actually slice rows, despite the names)
    data_arr = pd.read_csv('../data/RenMinTitle_all_2020_04_06.csv').iloc[
        startColum:endColum, :30].values
    # merge each day's titles into a single string (one entry per day)
    data_list = []
    for arr in data_arr:
        data_list.append(' '.join(arr))
    print(data_list)
    # build the date list
    # date_list = netUtil.getDateList(len(data_list))
    # 2-D list storing each day's keywords
    keywords_allday_list = []
    # set of all keywords, used later to build the connection matrix
    keywords_all_set = set()
    # extract each day's keywords
    for day_title in data_list:
        day_keywords_list = []
        words = TextRank4Keyword()
        words.analyze(text=day_title, lower=True, window=3)
        keywords_dic_list = words.get_keywords(40, word_min_len=2)
        for one_dic in keywords_dic_list:
            # the commented-out block below selects nouns via SnowNLP POS tags
            # s_tags = SnowNLP(one_dic['word']).tags
            # print('---')
            # flag = 0
            # for tag in s_tags:
            #     if tag[1] == 'n':
            #         flag = 1
            #         print(tag)
            # if one_dic['word'] in keyword:
            #     if flag == 1:
            if one_dic['word'] in keyword:
                day_keywords_list.append(one_dic['word'])
                keywords_all_set.add(one_dic['word'])
                if len(day_keywords_list) >= max_num:
                    break
        keywords_allday_list.append(day_keywords_list)
    return keywords_allday_list
Code example #5
 def process(self):
     # concatenate the scraped elements into one article string
     for i in self.text:
         self.article += i.getText() + '\n'
     self.article = self.article.strip()
     keywords = []
     abstract = []
     ## keywords
     tr4w = TextRank4Keyword()
     tr4w.analyze(text=self.article, lower=True, window=2)
     for item in tr4w.get_keywords(4, word_min_len=1):
         keywords.append(item.word)
     ## abstract
     tr4s = TextRank4Sentence()
     tr4s.analyze(text=self.article, lower=True, source='all_filters')
     for item in tr4s.get_key_sentences(num=3):
         abstract.append(item.sentence)
     return keywords, abstract
Code example #6
File: textrank.py  Project: Sixy1204/bda2020-spring
def compute_textrank(df, filepath):
    writer = pd.ExcelWriter(filepath)
    for names in df:

        all_re = []
        test_df = df[names]
        text_all = concat_all(df=test_df[:5])  # originally the full test_df; truncated to test_df[:5] for the demo
        tr4w = TextRank4Keyword()
        tr4w.analyze(text=text_all, lower=True, window=6)
        for item in tr4w.get_keywords(num=100, word_min_len=2):
            all_re.append([item.word, item.weight])

        df_result = pd.DataFrame(all_re)
        df_result.columns = ['關鍵詞', 'Textrank分數']  # 'keyword', 'TextRank score'
        df_result.to_excel(writer, sheet_name=names)

    writer.save()  # on pandas >= 2.0 use writer.close() instead
Code example #7
File: trad_test.py  Project: joyce99/XieYuxin
def get_textRank_kp(file_name, topk):
    textRank_kp = []
    with open(file_name, 'r', encoding='utf-8') as json_file:
        for line in json_file:
            json_data = json.loads(line)
            cur_content = (json_data['title'].strip().lower() + ' ' +
                           json_data['abstract'].strip().lower())
            tr4w = TextRank4Keyword()
            tr4w.analyze(text=cur_content, lower=True, window=2)
            keywords_list = []
            for item in tr4w.get_keywords(topk, word_min_len=1):
                keywords_list.append(item.word)
            kp_list = get_kp(cur_content, keywords_list)
            # textRank_kp = tr4w.get_keyphrases(keywords_num=20, min_occur_num=2)
            textRank_kp.append(kp_list)
    return textRank_kp
Code example #8
File: abstract.py  Project: xzb123/read_parse_storage
def getKeyword(text, keywords_num=20):
    """Extract keywords_num keywords from text, plus key phrases as a bonus.
    A key phrase initially must occur at least phrase_min_num=5 times in the
    text; if none qualifies, the threshold is lowered one step at a time.
    Returns the keyword list (item.word, item.weight) and the key phrase list.
    """
    tr4w = TextRank4Keyword()
    # analyze the text with a window of 2, lower-casing English words
    tr4w.analyze(text=text, lower=True, window=2)
    kw = tr4w.get_keywords(num=keywords_num, word_min_len=1)
    mon = 5
    while mon > 0:
        kp = tr4w.get_keyphrases(keywords_num=keywords_num, min_occur_num=mon)
        if len(kp) > 0:
            print(mon)
            break
        mon -= 1
    return kw, kp
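A hedged usage sketch for getKeyword; the sample text is invented:

text = '机器学习是人工智能的一个分支。机器学习研究计算机怎样模拟人类的学习行为。'
kw, kp = getKeyword(text, keywords_num=10)
for item in kw:
    print(item.word, item.weight)  # keyword items expose .word and .weight
print(kp)  # key phrases found at the highest min_occur_num that produced any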
Code example #9
File: wordUtil.py  Project: lyz21/Project200723
def get_words_list(path='../data/countryComment.csv'):
    tr4w = TextRank4Keyword()
    # load the comments ('评论内容' is the comment-content column)
    comments = pd.read_csv(path)['评论内容'].values
    # segment each comment into words
    word_list = []
    for comment in comments:
        tr4w.analyze(text=comment)
        words = tr4w.words_all_filters
        words = sum(words, [])  # flatten the 2-D list to 1-D
        word_list.append(words)
        print(word_list)
    words = sum(word_list, [])
    words = pd.DataFrame(words)
    words.to_csv('../data/words.csv', index=False)
    print('words.csv saved successfully!')
    return words
Code example #10
 def generator(self, text):
     tr4w = TextRank4Keyword(stop_words_file='/home/ubuntu/HITChat/stopword.txt')  # load the stop-word list
     # small text, so a window of 3; note speech_tag_filter=False actually disables POS filtering
     # (train() is the legacy textrank4zh API; current releases use analyze())
     tr4w.train(text=text, speech_tag_filter=False, lower=True, window=3)
     key = []
     origin = []
     with open("/home/ubuntu/HITChat/userarticle.txt", "r") as f:
         for line in f.readlines():
             origin.append(line.strip())
     for word in tr4w.get_keywords(10, word_min_len=2):
         key.append(word)
     # remember previously unseen keywords for this user, up to 200 lines
     if len(origin) <= 200:
         with open("/home/ubuntu/HITChat/userarticle.txt", "a") as f:
             for i in range(len(key)):
                 if key[i] not in origin:
                     f.write(key[i] + "\n")
     return key
Code example #11
def get_chinese_keywords(text):
    tr4w = TextRank4Keyword()

    tr4w.analyze(text=text, lower=True, window=5)

    keywords_ = []
    for item in tr4w.get_keywords(10, word_min_len=2):
        # print(item.word, item.weight)
        keywords_.append(item.word)

    # print()
    # print('Key phrases:')
    keyphrases = []
    for phrase in tr4w.get_keyphrases(keywords_num=20, min_occur_num=2):
        keyphrases.append(phrase)

    return keywords_, keyphrases
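For illustration, one way the two return values might be consumed (sample text invented):

text = '程序员是从事程序开发、程序维护的专业人员。程序员分为程序设计人员和程序编码人员。'
keywords, keyphrases = get_chinese_keywords(text)
print('keywords:', keywords)      # up to 10 words of length >= 2
print('keyphrases:', keyphrases)  # phrases assembled from the top 20 keywords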
Code example #12
File: single_pass_cluster.py  Project: lbrb/hasika_ml
    def fit_transform(self, theta=0.5):
        datMat = self.loadData(self.data)
        word_segmentation = []
        for i in range(len(datMat)):
            word_segmentation.append(self.word_segment(datMat[i]))
        print(
            "............................................................................................"
        )
        print('Text segmentation finished!')

        # get the vector-space representation of the documents
        corpus_tfidf = self.get_Tfidf_vector_representation(word_segmentation)
        # corpus_tfidf = self.get_Doc2vec_vector_representation(word_segmentation)
        dictTopic, clusterTopic = self.single_pass(corpus_tfidf, datMat, theta)
        print(
            "............................................................................................"
        )
        print("Number of topics found: {} ...".format(len(dictTopic)))
        print(
            "............................................................................................\n"
        )
        # sort topics by the number of clustered sentences to surface the important clusters
        clusterTopic_list = sorted(clusterTopic.items(),
                                   key=lambda x: len(x[1]),
                                   reverse=True)
        for k in clusterTopic_list[:30]:
            cluster_title = '\n'.join(k[1])
            # print(''.join(cluster_title))
            # extract the topic keywords of each cluster
            word = TextRank4Keyword()
            word.analyze(''.join(self.word_segment(''.join(cluster_title))),
                         window=5,
                         lower=True)
            w_list = word.get_keywords(num=10, word_min_len=2)
            sentence = TextRank4Sentence()
            sentence.analyze('\n'.join(k[1]), lower=True)
            s_list = sentence.get_key_sentences(num=3, sentence_min_len=5)[:30]
            print("[Topic index]: {} \n[Topic volume]: {} \n[Topic keywords]: {} \n[Topic central sentences]:\n{}".format(
                k[0], len(k[1]), ','.join([i.word for i in w_list]),
                '\n'.join([i.sentence for i in s_list])))
            print('\n')
            print("[Titles]:", '\n'.join([content[:20] for content in k[1]]))
            print(
                "-------------------------------------------------------------------------"
            )
Code example #13
def text_keyword_abstract(article, keywords_len, sentences_len):
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=article, lower=True, window=2)
    keywords = []
    for item in tr4w.get_keywords(keywords_len, word_min_len=2):
        keywords.append(item.word)
    keywords = ' '.join(keywords)

    # skip the article's lead sentence so the abstract adds new information
    sentences = article.split('.')
    first_sentence = sentences[0]
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=article, lower=True, source='all_filters')
    abstract = []
    for item in tr4s.get_key_sentences(num=sentences_len):
        if item.sentence != first_sentence:
            abstract.append(item.sentence + '.')
    abstract = '\n'.join(abstract)
    return keywords  #, abstract
Code example #14
def keywords_extraction(text, num, word_min_len):
    tr4w = TextRank4Keyword(allow_speech_tags=['n', 'nr', 'nrfg', 'ns', 'nt', 'nz'])
    # allow_speech_tags -- POS whitelist; words with other parts of speech are filtered out
    tr4w.analyze(text=text, window=2, lower=True, vertex_source='all_filters', edge_source='no_stop_words',
                 pagerank_config={'alpha': 0.85, })
    # text    -- the text to analyze, a string
    # window  -- window size (int) used to build edges between words; default 2
    # lower   -- whether to lower-case English text; default False
    # vertex_source -- which of words_no_filter, words_no_stop_words, words_all_filters
    #                  supplies the nodes of the PageRank graph; default 'all_filters',
    #                  options 'no_filter', 'no_stop_words', 'all_filters'
    # edge_source   -- which of the three word lists supplies the edges between nodes;
    #                  default 'no_stop_words', same options; edges are built within `window`
    # pagerank_config -- PageRank configuration; damping factor alpha = 0.85
    keywords = tr4w.get_keywords(num=num, word_min_len=word_min_len)
    # num          -- number of keywords to return
    # word_min_len -- minimum word length; default 1
    return keywords
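Given the parameter notes above, a call could look like this (a sketch; the sample text is invented):

text = '水利部专家在新闻发布会上介绍了全国水资源的总体情况。'
for item in keywords_extraction(text, num=10, word_min_len=2):
    print(item.word, item.weight)  # only nouns survive the allow_speech_tags filter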
Code example #15
        def nlp(contents):
            # contents: an iterable of text fragments, joined into one string
            tr4w = TextRank4Keyword()
            tr4w.analyze(text=''.join(contents), lower=True, window=2)

            tr4s = TextRank4Sentence()
            tr4s.analyze(text=''.join(contents),
                         lower=True,
                         source='all_filters')

            keyword = list(tr4w.get_keywords(20, word_min_len=1))
            keyphrase = list(tr4w.get_keyphrases(keywords_num=20,
                                                 min_occur_num=2))
            keysentence = list(tr4s.get_key_sentences(num=3))
            return keyword, keyphrase, keysentence
Code example #16
def sina_keyword_abstract(article, keywords_len, sentences_len):
    # extract keywords
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=article, lower=True, window=2)
    keywords = []
    for item in tr4w.get_keywords(keywords_len, word_min_len=1):
        keywords.append(item.word)
    # extract the abstract, skipping boilerplate sentences that start with
    # '原标题' (original title), '责任编辑' (editor in charge) or '来源' (source)
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=article, lower=True, source='all_filters')
    abstract = []
    for item in tr4s.get_key_sentences(num=sentences_len):
        if str(item.sentence).startswith('原标题') or str(item.sentence).startswith('责任编辑') or str(
                item.sentence).startswith('来源'):
            continue
        abstract.append(item.sentence + '。')
    abstract = '\n'.join(abstract)
    return keywords, abstract
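A short driver sketch for sina_keyword_abstract (the article string is invented):

article = '原标题:某公司发布新产品。该产品的性能较上一代提升明显。来源:新华网。'
keywords, abstract = sina_keyword_abstract(article, keywords_len=5, sentences_len=3)
print(keywords)  # plain keyword list
print(abstract)  # boilerplate sentences (原标题/责任编辑/来源) are filtered out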
Code example #17
def contain_word(query, key_query):
    import_html_dir = Path.cwd().parent / 'html' / query
    index_csv_dir = Path.cwd().parent / 'csv' / f'{query}.csv'
    export_dir = Path.cwd().parent / 'category'
    if not export_dir.exists():
        export_dir.mkdir()
    export_html_dir = Path.cwd().parent / 'category' / query
    if not export_html_dir.exists():
        export_html_dir.mkdir()
    index = pd.read_csv(index_csv_dir)

    for key, value in key_query.items():
        # write the header into the same directory that rows are appended to below
        with open(os.path.join(export_html_dir, f'{key}.csv'), 'w',
                  encoding='utf-8-sig') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["title", "url"])

    for i in range(len(index)):
        url = index.iloc[i, 2]
        title = index.iloc[i, 1]
        with codecs.open(os.path.join(import_html_dir, f'{title}.html'),
                         'r', 'utf-8') as f:
            html_file = f.read()
        end = 'media_tool_meta tips_global_primary meta_primary'
        html_file = html_file[:html_file.rfind(end)]
        text = BeautifulSoup(html_file, features="lxml").get_text()  # strip the HTML down to plain text
        tr4w = TextRank4Keyword()
        tr4w.analyze(text=text, lower=True, window=2)
        list_key = []
        for key_word in tr4w.get_keywords(10, word_min_len=2):
            list_key.append(key_word.word)
        textrank = analyse.textrank
        keywords = textrank(text, 10)
        for keyword in keywords:
            list_key.append(keyword)
        list_key = set(list_key)
        print(title, list_key)
        for key, value in key_query.items():
            if not list_key.isdisjoint(value):
                print(title, key)
                with open(os.path.join(export_html_dir, f'{key}.csv'),
                          'a',
                          encoding='utf-8-sig') as csvfile:
                    writer = csv.writer(csvfile)
                    writer.writerows([(title, url)])
Code example #18
def Keyword():
    with codecs.open('/Users/liamtheron/Desktop/Deloiite/test.txt',
                     'r', encoding='utf-8') as f:
        text = f.read()
    tr4w = TextRank4Keyword()
    tr4s = TextRank4Sentence()
    tr4w.analyze(text=text, lower=True, window=2)
    tr4s.analyze(text=text, lower=True)
    print('<Keywords>:')
    for item in tr4w.get_keywords(20, word_min_len=1):
        print(item.word, item.weight)
    print()
    print('<Key phrases>:')
    for phrase in tr4w.get_keyphrases(keywords_num=20, min_occur_num=2):
        print(phrase)
    print()
    print('<Abstract>:')
    for item in tr4s.get_key_sentences(num=3):
        print(item.index, item.weight, item.sentence)
Code example #19
def zhihu_exec():
    Qlink = request.form.get('Qlink', '')
    Client = zhihu_oauth.ZhihuClient()
    Client.load_token('token.pkl')
    me = Client.me()  # not used below; kept from the original
    question = Client.question(int(Qlink))
    out_path = os.path.join('projhorus', 'static',
                            'question_%s_result.txt' % Qlink)
    with open(out_path, 'w') as f:
        for i in question.answers:
            tr4w = TextRank4Keyword()
            tr4w.analyze(text=i.content, lower=True, window=2)
            f.write(u'<-------------------- Keywords -------------------->\n')
            for item in tr4w.get_keywords(20, word_min_len=1):
                f.write(str(item.word) + '  ' + str(item.weight) + '\n')
            f.write(u'<-------------------- Key phrases -------------------->\n')
            for phrase in tr4w.get_keyphrases(keywords_num=20, min_occur_num=2):
                f.write(phrase + '\n')
    global FLAG
    FLAG = '/static/question_%s_result.txt' % Qlink
    return redirect('/')
Code example #20
def textrank_keyWords(cat):
    df_u8 = pd.read_csv('contents/' + cat + '_u8.csv', encoding="utf_8_sig")
    df_gb = pd.read_csv('contents/' + cat + '_gb.csv', encoding="GB2312")

    content = []
    for i in range(len(df_gb['content'])):
        tempStr = str(df_gb['title'][i]).strip() + ' ' + str(
            df_gb['content'][i]).strip()
        content.append(tempStr)
    for i in range(len(df_u8['content'])):
        content.append(
            str(df_u8['title'][i]).strip() + ' ' +
            str(df_u8['content'][i]).strip())

    words = {}
    tr4w = TextRank4Keyword(stop_words_file='stopWords.txt')
    randomList = []
    for i in range(500):
        randomList.append(random.randint(0, len(content) - 1))

    for i in randomList:
        tr4w.analyze(text=content[i], lower=True, window=2)
        for item in tr4w.get_keywords(5, word_min_len=1):
            # stopList and specStop are module-level stop-word lists in the original project
            if item.word not in stopList and item.word not in specStop:
                words[item.word] = words.get(item.word, 0) + item.weight
    sorted_by_value = sorted(words.items(), key=lambda kv: kv[1], reverse=True)
    #print(cat + ':')
    tags = []
    frequency = []
    for word, weight in sorted_by_value[0:20]:
        frequency.append(float(weight) * 1000)
        tags.append(word)
    total_text = {}
    for i in range(len(tags)):
        total_text[tags[i]] = int(frequency[i])
    wordcloud = WordCloud(font_path='C:/WINDOWS/Fonts/STKAITI.TTF',
                          background_color='white').fit_words(total_text)
    wordcloud.to_file('wordcloud/' + cat + '_tr.jpg')
Code example #21
File: action_4.py  Project: IELBHJY/RS6
def action_two():
    import pandas as pd
    from textrank4zh import TextRank4Keyword, TextRank4Sentence
    news = pd.read_table('textrank/news.txt', encoding='GB18030', header=None)
    strings = ''
    for index in range(news.shape[0]):
        strings += news.loc[index, 0]
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=strings, lower=True, window=3)
    print('Keywords:')
    for item in tr4w.get_keywords(20, word_min_len=2):
        print(item.word, item.weight)

    tr4s = TextRank4Sentence()
    tr4s.analyze(text=strings, lower=True, source='all_filters')
    print('Abstract:')
    # the three most important sentences
    for item in tr4s.get_key_sentences(num=3):
        print(item.weight, item.sentence)
Code example #22
 def get_summary(self, data, flag=0):
     # flag == 0 returns the top key phrase; any other value returns the top key sentence
     text = "".join(data)
     if flag == 0:
         tr4w = TextRank4Keyword()
         tr4w.analyze(text=text, lower=True, window=2)
         #ret = tr4w.get_keywords()
         ret = tr4w.get_keyphrases(keywords_num=12, min_occur_num=0)
         if len(ret) > 0:
             return ret[0]
         else:
             return ""
     else:
         tr4s = TextRank4Sentence()
         tr4s.analyze(text=text, lower=True)
         ret = tr4s.get_key_sentences(num=6, sentence_min_len=4)
         if len(ret) > 0:
             return ret[0]['sentence']
         else:
             return ""
Code example #23
def textSummary(fileName, finalName):
    text = codecs.open(fileName, 'r', 'utf-8').read()
    tr4w = TextRank4Keyword()

    tr4w.analyze(
        text=text, lower=True, window=2
    )  # in py2 text must be a utf8-encoded str or unicode object; in py3 a utf8-encoded bytes or str object
    # keyword model (analyzed but never queried below)
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    # bucket the top-ranked sentences by sentence length
    sentences = {}
    for item in tr4s.get_key_sentences(num=50):
        sentences.setdefault(len(item.sentence), []).append(item.sentence)

    key_sentences = []
    for i in range(3):
        key = max(sentences.keys())
        #         print(sentences[key])
        key_sentences.append(sentences[key][0])
        sentences.pop(key)

    # split the original text on full-width CJK punctuation
    text = re.split(
        pattern=
        r'[\u3000-\u301e\ufe10-\ufe19\ufe30-\ufe44\ufe50-\ufe6b\uff01-\uffee]',
        string=text)

    ks = []
    for te in text:
        for k in key_sentences:
            if str(te) in str(k) and str(k) not in ks:
                ks.append(k)

    line = ''
    for k in ks:
        line += k + " "

    with codecs.open(finalName + "output.txt", 'a', 'utf-8') as fw:
        fw.write(line)
Code example #24
def get_textrank4zh_keywords_phrase(contents):
    """
    Extract key phrases from the text. This feature is somewhat rough and
    not very reliable in practice.
    :param contents: string
    :return: list of key-phrase strings
    """
    # return the top 20 key phrases
    topK = 20
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=contents, lower=True)

    # logger.info('extracting key phrases with textrank4zh, 20 by default')

    # print('Key phrases:')
    # for phrase in tr4w.get_keyphrases(keywords_num=20, min_occur_num=2):
    #     print(phrase)

    result = tr4w.get_keyphrases(keywords_num=topK, min_occur_num=2)

    return result
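A usage sketch (the sample string is invented):

for phrase in get_textrank4zh_keywords_phrase('测试文本用于提取关键短语,关键短语由相邻的关键词拼接而成。'):
    print(phrase)  # each result is a key-phrase string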
Code example #25
def generate_keywords(text, number):
    """
    textrank4zh: a TextRank implementation for Chinese text, used for keyword
    and summary extraction. https://github.com/letiantian/TextRank4ZH
    pyhanlp: the Python interface to the HanLP NLP toolkit; implements TextRank
    keyword extraction. https://github.com/hankcs/pyhanlp
    jieba: the jieba word-segmentation toolkit; implements TF-IDF and TextRank
    keyword extraction. https://github.com/fxsjy/jieba

    :param text: the text to analyze
    :param number: number of keywords per extractor
    :return: dict mapping extractor name to its keyword list
    """
    keywords = {}
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=text, lower=True, window=2)
    for item in tr4w.get_keywords(number, word_min_len=1):
        keywords.setdefault('textrankzh', []).append(item.word)

    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    abstract = ""
    TextRankKeyword = JClass('com.hankcs.hanlp.summary.TextRankKeyword')  # loaded but not used below

    for item in tr4s.get_key_sentences(num=2):
        abstract += item.sentence
    tr4w.analyze(text=abstract, lower=True, window=2)
    for item in tr4w.get_keywords(number, word_min_len=1):
        keywords.setdefault('textrank_abs', []).append(item.word)

    for item in HanLP.extractKeyword(text, number):
        keywords.setdefault('hanlp', []).append(item)
    for item in HanLP.extractKeyword(abstract, number):
        keywords.setdefault('hanlp_abs', []).append(item)

    for item in jieba.analyse.extract_tags(text, topK=number, withWeight=False, allowPOS=()):
        keywords.setdefault('tfidf', []).append(item)
    for item in jieba.analyse.textrank(text, topK=number, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')):
        keywords.setdefault('jiebarank', []).append(item)
    for item in jieba.analyse.extract_tags(abstract, topK=number, withWeight=False, allowPOS=()):
        keywords.setdefault('tfidf_abs', []).append(item)
    for item in jieba.analyse.textrank(abstract, topK=number, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')):
        keywords.setdefault('jiebarank_abs', []).append(item)
    return keywords
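A sketch comparing the extractors side by side (assumes textrank4zh, pyhanlp and jieba are installed and imported as the snippet expects):

text = '自然语言处理是计算机科学与人工智能的交叉领域。'
for method, words in generate_keywords(text, number=5).items():
    print(method, words)  # 'textrankzh', 'hanlp', 'tfidf', 'jiebarank' and the *_abs variants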
Code example #26
 def extract_key_information(self,
                             num_key_word=30,
                             num_key_phrase=20,
                             num_key_sentence=5):
     text = ''.join(self.article_list)
     # create the keyword extractor
     tr4w = TextRank4Keyword()
     # analyze the text with a window of 2, lower-casing English words
     tr4w.analyze(text=text, lower=True, window=2)
     with open(self.rule_reference_filename, "a") as f:
         # write the top keywords, skipping stop words and filtered entries
         f.write(
             '########################### Keywords ##################################'
             + '\n')
         for item in tr4w.get_keywords(num=num_key_word, word_min_len=1):
             if item.word in self.stopwords or item.word in self.filter_dictionary:
                 continue
             else:
                 f.write(item.word + '\t' + str(item.weight) + '\n')
     with open(self.rule_reference_filename, "a") as f:
         # write the top key phrases
         f.write(
             '########################## Key phrases ##################################'
             + '\n')
         for phrase in tr4w.get_keyphrases(keywords_num=num_key_phrase,
                                           min_occur_num=2):
             f.write(phrase + '\n')
     # create the sentence extractor
     tr4s = TextRank4Sentence()
     # lower-case English words, apply POS filtering and drop stop words
     tr4s.analyze(text=text, lower=True, source='all_filters')
     with open(self.rule_reference_filename, "a") as f:
         # write the top key sentences
         f.write(
             '########################### Key sentences ##################################'
             + '\n')
         for item in tr4s.get_key_sentences(num=num_key_sentence):
             f.write(
                 str(item.index) + str(item.weight) + str(item.sentence) +
                 '\n')
             f.write('----------------' + '\n')
Code example #27
File: TextRank_generate_ppt.py  Project: buaaYYC/ppt
def textRank_ppt(url, num_abs):
    """
    :param url: URL of the source article
    :param num_abs: number of ppt slides to generate
    :return:
    """
    title, texts = article_extract(url)
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=texts, lower=True, window=2)   # in py2 text must be a utf8 str or unicode object; in py3 a utf8 bytes or str object
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=texts, lower=True, source='all_filters')
    print('Keywords:')
    key_words = ""
    for item in tr4w.get_keywords(7, word_min_len=2):
        print(item.word, item.weight)
        key_words = key_words + item.word + "\n"

    # generate the ppt
    prs = Presentation()
    slide1, body_shape1 = ppt1.add_slide(prs=prs, slide_title=title, style_number=0)
    slide2, body_shape2 = ppt1.add_slide(prs=prs, style_number=1, slide_title="关键词", content="")
    ppt1.add_paragraph(body_shape2, text=key_words, size=20)
    i = 0
    # extract images from the page and add them to the ppt
    extract_image.pic_extract(url)
    print("Sentences:")
    for item in tr4s.get_key_sentences(num=(num_abs - 2) * 2):

        if i % 2 == 0:
            slide3, body_shape3 = ppt1.add_slide(prs=prs, style_number=1, slide_title="摘要", content="")
            try:
                ppt1.add_picture(slide2=slide3, pic_path="image1/image_" + str(i) + ".jpg")
            except:
                print("no picture")
        i += 1
        # print(len(item.sentence), item.index)
        ppt1.add_paragraph(body_shape3, text=item.sentence, size=20)
    prs.save('test.pptx')
    print("ppt generated")
Code example #28
def extractKeywordFromUser(sentence, targetNum):
    textrank = TextRank4Keyword()
    s = sentence
    textrank.analyze(text=s, lower=True, window=2)
    keywords = [
        item.word for item in textrank.get_keywords(targetNum, word_min_len=1)
    ]
    candidatewords = []
    candidatescores = []
    # if TextRank yields too few keywords, back-fill with the nearest synonyms,
    # ranked by similarity (a score of 1 means the word itself and is skipped)
    if len(keywords) < targetNum:
        for keyword in keywords:
            wordlist, scorelist = synonyms.nearby(keyword)
            candidatewords.extend(wordlist)
            candidatescores.extend(scorelist)
        sortedIndex = np.argsort([-i for i in candidatescores])
        sortedIndex = [
            sortedIndex[i] for i in range(len(sortedIndex))
            if not candidatescores[sortedIndex[i]] == 1
        ]
        for i in range(targetNum - len(keywords)):
            keywords.append(candidatewords[sortedIndex[i]])
    return keywords
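A hedged example of the synonym back-fill (assumes the synonyms package and its word-vector data are installed):

kws = extractKeywordFromUser('我想了解人工智能的发展', targetNum=5)
print(kws)  # TextRank keywords first, then nearest synonyms until 5 words are collected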
Code example #29
    def main(self, content, label, window=2, k=20):
        """
        Extract keywords per category using TextRank.
        @ param `content` list of texts; each should be a pre-segmented string.
        @ param `label` list of strings/integers giving each text's category.
        @ param `window` co-occurrence window size.
        @ param `k` top k keywords will be returned, default 20.
        """
        keywords = {}
        cate_content = {}
        # concatenate all texts that belong to the same category
        for c, l in zip(content, label):
            if l in cate_content:
                cate_content[l] += " " + c.replace(" ", "")
            else:
                cate_content[l] = c.replace(" ", "")
        for cate, text in cate_content.items():
            tr4w = TextRank4Keyword()
            tr4w.analyze(text=text, lower=True, window=window)
            kws = tr4w.get_keywords(k, word_min_len=1)
            keywords[cate] = [x.word for x in kws]

        return keywords
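A sketch of driving main with two categories; KeywordExtractor is a hypothetical name for the method's owner class:

content = ['今天 天气 很 好', '股市 出现 大幅 上涨', '明天 天气 继续 晴朗']
label = ['weather', 'finance', 'weather']
extractor = KeywordExtractor()  # hypothetical class holding main()
print(extractor.main(content, label, window=2, k=3))  # {'weather': [...], 'finance': [...]}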
Code example #30
File: SAL.py  Project: 447428054/WBFL
def finalGet(content, number=5):
    text = content
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=text, lower=True, window=2)
    # collect key phrases so a user can recognize the Note in a list by them
    wordList = []
    for phrase in tr4w.get_keyphrases(keywords_num=20, min_occur_num=2):
        wordList.append(phrase)
    # back-fill with single keywords when fewer phrases than requested are found
    if len(wordList) < number:
        for item in tr4w.get_keywords(number - len(wordList), word_min_len=2):
            wordList.append(item.word)
    print(wordList)
    # build one combined summary from the three most central sentences
    summary = []
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    for item in tr4s.get_key_sentences(num=3):
        summary.append(item.sentence)
    print(summary)
    sumText = '。'.join(summary)
    print(sumText)
    return (wordList, sumText)
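Finally, a usage sketch for finalGet (the note text is invented):

note = '会议讨论了下一季度的销售计划。销售计划覆盖三个重点市场。会议还确定了各市场的负责人。'
wordList, sumText = finalGet(note, number=5)
print(wordList)  # key phrases, padded with keywords when fewer than 5
print(sumText)   # up to three key sentences joined with '。'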