def _hash_text(key, field, default=""):
    """Return hash field *field* of *key* decoded as UTF-8, or *default* if it is missing."""
    raw = db.hget(key, field)
    return default if raw is None else raw.decode("utf-8")


def _songs_of_singer(singer_id):
    """Return every ``song:*`` key whose ``singer_id`` hash field equals *singer_id*."""
    wanted = str(singer_id)
    seen = set()
    matched = []
    # scan_iter keeps following the cursor until the keyspace is exhausted.
    # A single SCAN call only returns one page, and COUNT is merely a hint,
    # so relying on one call can silently miss songs.
    for key in db.scan_iter(match="song:*", count=10000):
        if key in seen:  # SCAN is allowed to report the same key more than once
            continue
        seen.add(key)
        # Songs without a singer_id field can never match and are skipped.
        if _hash_text(key, "singer_id", None) == wanted:
            matched.append(key)
    return matched


def _lyricist_from_line(line):
    """Extract the lyricist name from a credit line, or return None if there is none.

    The line is converted to simplified Chinese first, then split on the
    first colon. Both the full-width colon (U+FF1A) and the ASCII colon are
    accepted as the separator between the label and the name.
    """
    parts = SnowNLP(line).han.replace("\uff1a", ":").split(":")
    if len(parts) < 2:
        # No separator at all: nothing to attribute, and indexing would fail.
        return None
    return parts[1].strip() or None


def process_prepare_data(singer_id):
    """Analyse all songs of one singer and render the summary charts.

    For every song stored under a ``song:*`` key whose ``singer_id`` matches,
    this computes (or reads the cached) lyric emotion score, counts lyricists
    and lyric word frequencies, and then writes four HTML charts into the
    current working directory:

    * top 20 songs by emotion score (bar chart),
    * top 10 most frequent lyricists (pie chart),
    * top 20 most frequent lyric words (bar chart),
    * top 20 songs by comment count (line chart).

    Emotion scores that are not yet cached are stored back under
    ``score:<song key>``.

    :param singer_id: identifier of the singer; compared as ``str(singer_id)``
        against the ``singer_id`` field of each song hash.
    :return: None
    """
    # Collect the singer's songs and order them by comment count, hottest first.
    # A missing comment_count field is treated as 0 comments.
    sorted_songs = [
        (song_key, int(_hash_text(song_key, "comment_count", "0")))
        for song_key in _songs_of_singer(singer_id)
    ]
    sorted_songs.sort(key=lambda item: item[1], reverse=True)

    song_names = {}  # song key -> decoded name, reused by the hot-songs chart
    emotion_list = []
    words_count = Counter()
    maker_count = Counter()

    for song_key, _ in sorted_songs:
        # Lyric text plus the credit lines (lyricist, composer, ...).
        text, maker_info = merge_lyric_text(song_key)
        song_name = _hash_text(song_key, "name")
        song_names[song_key] = song_name

        # Emotion score: use the cached value when present, otherwise compute and cache it.
        score_key = b'score:' + song_key
        song_score = db.get(score_key)
        if song_score is None:
            song_score = process_emotion(text)
            db.set(score_key, song_score)
            log("歌曲 {} 未发现情绪分,已计算后存入Redis".format(song_name.replace("\n", "")))
        emotion_list.append((song_name, float(song_score)))

        # Lyricist statistics: only credit lines mentioning the lyricist label count.
        for line in maker_info.split("\n"):
            if "作词" in line:
                lyricist = _lyricist_from_line(line)
                if lyricist:
                    maker_count[lyricist] += 1

        # Word-frequency statistics.
        words_count.update(process_frequency(text))
        maker_analysis(maker_info)

    # ---- Charts ----
    # Lyric emotion: the 20 songs with the highest score.
    top_emotions = sorted(emotion_list, key=lambda item: item[1], reverse=True)[:20]
    emotion_bar = Bar()
    emotion_bar.add_xaxis([name for name, _ in top_emotions])
    emotion_bar.add_yaxis("情绪分值", [round(score, 3) for _, score in top_emotions])
    emotion_bar.set_global_opts(title_opts=opts.TitleOpts(title="歌词情绪好的前20首歌曲"))
    emotion_bar.render("[歌手id-{}]歌词情绪好的前20首歌曲.html".format(singer_id))

    # Lyricists: the 10 most frequent collaborators.
    maker_pie = Pie()
    maker_pie.add("出现次数", maker_count.most_common(10))
    maker_pie.set_global_opts(
        title_opts=opts.TitleOpts(title="合作次数最多的作词人前十名", pos_top="8%"),
        legend_opts=opts.LegendOpts(pos_left="15%"),
    )
    maker_pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{d}%"))
    maker_pie.render("[歌手id-{}]合作次数最多的作词人前十名.html".format(singer_id))

    # Lyric words: the 20 most frequent words.
    top_words = words_count.most_common(20)
    words_bar = Bar()
    words_bar.add_xaxis([word for word, _ in top_words])
    words_bar.add_yaxis("出现次数", [count for _, count in top_words], category_gap="25%")
    words_bar.set_global_opts(title_opts=opts.TitleOpts(title="歌词中高频出现的前20个词"))
    words_bar.render("[歌手id-{}]歌词中重复出现的前20个词.html".format(singer_id))

    # Hot songs: the 20 songs with the most comments.
    # Names are the decoded strings gathered above, not raw bytes, so the
    # axis labels serialise and display correctly.
    hottest = sorted_songs[:20]
    hot_line = Line()
    hot_line.add_xaxis([song_names[song_key] for song_key, _ in hottest])
    hot_line.add_yaxis("评论数", [comment_count for _, comment_count in hottest])
    hot_line.set_global_opts(title_opts=opts.TitleOpts(title="评论最火热的前20首歌曲"))
    hot_line.render("[歌手id-{}]热门歌曲TOP20.html".format(singer_id))
query_in_char = [] query_in_word = [] query_in_char_set = [] query_in_word_set = [] ques_mark = [] for i in range(0, len(lines)): l = lines[i] ge = json.loads(l) que = SnowNLP(ge.get('query', '')).han query.append(que) query_id.append(ge.get('query_id', '')) pas = SnowNLP(ge.get('passage', '')).han try: pas = pas.split('?')[-1].strip() except: pass pas = pas.replace(que, '') if len(pas) < 2: pas = SnowNLP(ge.get('passage', '')).han print(ge.get('passage', '')) print(ge.get('query', '')) print(ge.get('answer', '')) print('---') passage.append(pas) answer.append(ge.get('answer', '')) gege = clear_alternatives(ge.get('alternatives', '')) alternatives.append(gege)