Esempio n. 1
0
def _scan_all_keys(pattern, batch_size=10000):
    """Return every Redis key matching *pattern*, in first-seen order.

    A single ``SCAN`` call only returns one batch (``count`` is a hint, not a
    limit), so the cursor is followed until the server hands back ``0``.
    SCAN may report the same key more than once; duplicates are dropped.
    """
    keys = []
    seen = set()
    cursor = 0
    while True:
        cursor, batch = db.scan(cursor, match=pattern, count=batch_size)
        for key in batch:
            if key not in seen:
                seen.add(key)
                keys.append(key)
        if int(cursor) == 0:
            return keys


def _hget_text(key, field, default=None):
    """Read hash *field* of *key* and decode it as UTF-8; *default* if absent."""
    raw = db.hget(key, field)
    return default if raw is None else raw.decode("utf-8")


def _lyricist_from_line(line):
    """Return the lyricist name found on a credit line, or ``None``.

    Only lines containing "作词" are considered. The name is the text between
    the first and second colon; both the ASCII ":" and the full-width ":"
    are accepted. Lines without a colon or with an empty name yield ``None``.
    """
    if "作词" not in line:
        return None
    # SnowNLP's `han` is documented as traditional -> simplified conversion,
    # so the same person is not counted under two spellings.
    line = SnowNLP(line).han
    parts = line.replace(":", ":").split(":")
    if len(parts) < 2:
        return None
    return parts[1].strip() or None


def process_prepare_data(singer_id):
    """Analyse all songs of one singer and render four HTML charts.

    Songs are the Redis hashes under ``song:*`` whose ``singer_id`` field
    equals ``str(singer_id)``. For each of them the lyric text is obtained
    from ``merge_lyric_text``, an emotion score is read from the
    ``score:<song key>`` entry (computed with ``process_emotion`` and stored
    back when missing), lyricist credits are counted, word frequencies are
    accumulated via ``process_frequency`` and ``maker_analysis`` is invoked
    on the credit text.

    Charts written to the current working directory (file names contain
    *singer_id*): top-20 songs by emotion score, top-10 lyricists, top-20
    lyric words, and top-20 songs by comment count.

    Args:
        singer_id: Identifier of the singer; compared as a string.

    Returns:
        None.
    """
    # Collect the keys of every song that belongs to this singer; songs
    # lacking a singer_id field are skipped instead of crashing.
    wanted = str(singer_id)
    target_songs = [key for key in _scan_all_keys("song:*")
                    if _hget_text(key, "singer_id") == wanted]

    # Pair each song with its comment count (missing -> 0), hottest first.
    sorted_songs = [(key, int(_hget_text(key, "comment_count") or 0))
                    for key in target_songs]
    sorted_songs.sort(key=lambda item: item[1], reverse=True)

    song_names = {}
    emotion_list = []
    words_count = Counter()
    maker_count = Counter()
    for song_key, _ in sorted_songs:
        text, maker_info = merge_lyric_text(song_key)
        song_name = _hget_text(song_key, "name", "")
        song_names[song_key] = song_name

        # Emotion score: use the stored value, otherwise compute and store it.
        score_key = b'score:' + song_key
        song_score = db.get(score_key)
        if song_score is None:
            song_score = process_emotion(text)
            db.set(score_key, song_score)
            log("歌曲 {} 未发现情绪分,已计算后存入Redis".format(song_name.replace("\n", "")))
        emotion_list.append((song_name, float(song_score)))

        # Lyricist credits.
        for line in maker_info.split("\n"):
            lyricist = _lyricist_from_line(line)
            if lyricist is not None:
                maker_count[lyricist] += 1

        # Word frequencies.
        words_count.update(process_frequency(text))
        maker_analysis(maker_info)

    # ---- Charts ----

    # Top 20 songs by lyric emotion score.
    top_emotion = sorted(emotion_list, key=lambda item: item[1], reverse=True)[:20]
    emotion_bar = Bar()
    emotion_bar.add_xaxis([name for name, _ in top_emotion])
    emotion_bar.add_yaxis("情绪分值", [round(score, 3) for _, score in top_emotion])
    emotion_bar.set_global_opts(title_opts=opts.TitleOpts(
        title="歌词情绪好的前20首歌曲"))
    emotion_bar.render("[歌手id-{}]歌词情绪好的前20首歌曲.html".format(singer_id))

    # Ten most frequent lyricists.
    maker_pie = Pie()
    maker_pie.add("出现次数", maker_count.most_common(10))
    maker_pie.set_global_opts(title_opts=opts.TitleOpts(title="合作次数最多的作词人前十名",
                                                        pos_top="8%"),
                              legend_opts=opts.LegendOpts(pos_left="15%"))
    maker_pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{d}%"))
    maker_pie.render("[歌手id-{}]合作次数最多的作词人前十名.html".format(singer_id))

    # Twenty most frequent lyric words.
    top_words = words_count.most_common(20)
    words_bar = Bar()
    words_bar.add_xaxis([word for word, _ in top_words])
    words_bar.add_yaxis("出现次数", [count for _, count in top_words], category_gap="25%")
    words_bar.set_global_opts(title_opts=opts.TitleOpts(title="歌词中高频出现的前20个词"))
    words_bar.render("[歌手id-{}]歌词中重复出现的前20个词.html".format(singer_id))

    # Top 20 songs by comment count. Names come from the decoded cache built
    # above, so the axis holds str values rather than raw bytes.
    hottest = sorted_songs[:20]
    hot_line = Line()
    hot_line.add_xaxis([song_names[key] for key, _ in hottest])
    hot_line.add_yaxis("评论数", [count for _, count in hottest])
    hot_line.set_global_opts(title_opts=opts.TitleOpts(title="评论最火热的前20首歌曲"))
    hot_line.render("[歌手id-{}]热门歌曲TOP20.html".format(singer_id))
Esempio n. 2
0
        query_in_char = []
        query_in_word = []
        query_in_char_set = []
        query_in_word_set = []
        ques_mark = []
        for i in range(0, len(lines)):
            l = lines[i]
            ge = json.loads(l)

            que = SnowNLP(ge.get('query', '')).han
            query.append(que)

            query_id.append(ge.get('query_id', ''))
            pas = SnowNLP(ge.get('passage', '')).han
            try:
                pas = pas.split('?')[-1].strip()
            except:
                pass
            pas = pas.replace(que, '')

            if len(pas) < 2:
                pas = SnowNLP(ge.get('passage', '')).han
                print(ge.get('passage', ''))
                print(ge.get('query', ''))
                print(ge.get('answer', ''))
                print('---')
            passage.append(pas)

            answer.append(ge.get('answer', ''))
            gege = clear_alternatives(ge.get('alternatives', ''))
            alternatives.append(gege)