Exemple #1
0
 def save_all_users_info_to_file_by_multi_threading(self,threads = 10):
     '''
     Save the users' info to disk, using several threads to speed things up.

     The urls returned by ``self.load_all_users_url()`` are cut into
     ``threads`` contiguous slices and each slice is handed to its own
     thread running ``self.save_users_info(urls, num)``, where ``num`` is
     the total number of urls. The call returns once every thread has
     finished.

     :param threads: number of worker threads, must be at least 1
     :raises ValueError: if ``threads`` is smaller than 1
     '''
     if threads < 1:
         raise ValueError(
             "threads must be a positive integer, got {threads}".format(threads = threads))
     Helper.check_file_exits_and_overwrite(self.users_info_file_path)
     start_time = time.time()
     users_url = self.load_all_users_url()
     num = len(users_url)
     # Boundary i is i*num//threads, so slice sizes differ by at most one.
     # A plain num//threads chunk size becomes 0 when num < threads, which
     # pushes every url into the last thread.
     bounds = [i * num // threads for i in range(threads + 1)]
     # initialise the counter before any worker starts
     self.no_counter_init()
     threads_list = [
         Thread(target = self.save_users_info,args=(users_url[begin:end],num))
         for begin,end in zip(bounds,bounds[1:])
     ]
     for t in threads_list:
         t.start()
     for t in threads_list:
         t.join()
     end_time = time.time()
     self.logger.info("Using {threads} threads to save users info done,costs {cost_time} seconds"
             .format(threads = threads,cost_time = (end_time - start_time)))
Exemple #2
0
 def draw_wordcloud(self,cutted_words_text,save_path,
                    background_path = None,font_path = None,
                    max_words = 2000,max_font_size = 40,background_color = 'white'):
     '''
     Draw a word cloud and save the image to disk.

     :param cutted_words_text: already segmented text, words separated by spaces
     :param save_path: where the image is written
     :param background_path: path of the background (mask) image;
         ``Constants.DEFAULT_BACKGROUND_PATH`` is used when None
     :param font_path: path of the font file;
         ``Constants.DEFAULT_FONT_PATH`` is used when None
     :param max_words: maximum number of words
     :param max_font_size: maximum font size
     :param background_color: background colour
     :return: None
     '''
     Helper.check_file_exits_and_overwrite(save_path)
     # Only None selects the default; any other value is used as given.
     mask_image_path = Constants.DEFAULT_BACKGROUND_PATH if background_path is None else background_path
     font_file_path = Constants.DEFAULT_FONT_PATH if font_path is None else font_path
     cloud_options = dict(
         font_path = font_file_path,
         background_color = background_color,
         mask = imread(mask_image_path),
         max_words = max_words,
         max_font_size = max_font_size,
     )
     # build the cloud from the text and write it straight to save_path
     WordCloud(**cloud_options).generate(cutted_words_text).to_file(save_path)
     self.logger.info("Successfully generate wordcloud img to {save_path}!".format(save_path=save_path))
Exemple #3
0
    def save_singer_all_hot_comments_to_file(self):
        '''
        Save all hot comments of the singer's hot songs to disk.

        The song ids come from ``Helper.get_singer_hot_songs_ids(self.singer_url)``.
        For every id the hot comments are fetched with ``self.get_hot_comments``,
        converted with ``self.extract_comment_info_as_json_str`` and finally
        written to ``self.singer_all_hot_comments_file_path``. If no song id
        is returned an error is logged and nothing is written.

        :return: None
        '''
        target_path = self.singer_all_hot_comments_file_path
        Helper.check_file_exits_and_overwrite(target_path)
        # list of the singer's hot song ids
        hot_song_ids = Helper.get_singer_hot_songs_ids(self.singer_url)
        if len(hot_song_ids) == 0:
            self.logger.error(
                "crawl from %s to get %s all hot songs ids failed!" %
                (self.singer_url, self.singer_name))
            return
        url_template = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_{song_id}/?csrf_token="
        # gather the hot comments of every song before converting any of them
        collected_comments = []
        for hot_song_id in hot_song_ids:
            collected_comments += self.get_hot_comments(
                url_template.format(song_id=hot_song_id))
        json_lines = list(
            map(self.extract_comment_info_as_json_str, collected_comments))
        Helper.save_lines_to_file(json_lines, target_path)

        self.logger.info(
            "Write {singer_name}'s {num} hot songs hot comments successfully!".format(
                singer_name=self.singer_name, num=len(hot_song_ids)))
Exemple #4
0
 def save_all_users_info_to_file(self):
     '''
     Save the info of all users of a song to disk.

     The list returned by ``self.get_users_info_list()`` is written to
     ``self.users_info_file_path`` through ``Helper.save_lines_to_file``.

     :return: None
     '''
     Helper.check_file_exits_and_overwrite(self.users_info_file_path)
     Helper.save_lines_to_file(self.get_users_info_list(),
                               self.users_info_file_path)
Exemple #5
0
 def save_sorted_bar_plot(self,datas,label,title,key_index,
                          save_path,reverse = False):
     '''
     Count the values in ``datas``, sort the counts and save them as a bar chart.

     When ``datas`` is empty there is nothing to draw: a warning is logged
     and the method returns without touching ``save_path``.

     :param datas: input data, anything ``collections.Counter`` accepts
         (an iterable of hashable values, or a mapping of value -> count)
     :param label: label of the bar series
     :param title: chart title
     :param key_index: index into each ``(value, count)`` pair used as sort key
         (0 sorts by value, 1 sorts by count)
     :param save_path: where the chart is rendered to
     :param reverse: sort in descending order when True (ascending by default)
     :return: None
     '''
     sorted_counts = sorted(Counter(datas).items(), key=itemgetter(key_index),reverse=reverse)
     if not sorted_counts:
         # zip(*[]) yields nothing, so unpacking it into x,y would raise
         # ValueError and abort every chart that follows.
         self.logger.warning(
             "No data to plot for '{title}',skip rendering {save_path}".format(
                 title = title,save_path = save_path))
         return
     Helper.check_file_exits_and_overwrite(save_path)
     x,y = zip(*sorted_counts)
     bar = Bar(title)
     bar.add(label,x,y)
     bar.render(save_path)
Exemple #6
0
 def save_all_comments_to_file(self):
     '''
     Save all comments to disk sequentially (no threading).

     Every comment returned by ``self.get_all_comments()`` is converted with
     ``self.extract_comment_info_as_json_str`` and the results are written to
     ``self.comments_file_path``. The elapsed time is printed at the end.

     :return: None
     '''
     Helper.check_file_exits_and_overwrite(self.comments_file_path)
     started_at = time.time()
     # one converted entry per comment
     json_lines = []
     for comment in self.get_all_comments():
         json_lines.append(self.extract_comment_info_as_json_str(comment))
     Helper.save_lines_to_file(json_lines, self.comments_file_path)
     elapsed = time.time() - started_at
     print("It costs %.2f seconds to crawler <%s>." % (elapsed, self.song_name))
Exemple #7
0
 def save_lyrics_to_file(self):
     '''
     Save the song's lyrics to ``<song_path>/<song_name>_lyrics.txt``.

     The file starts with the song name and the singer name, followed by one
     lyric line per timestamped line of the ``lrc.lyric`` field of the JSON
     returned by ``self.get_lyrics_format_json()``. The leading
     ``[mm:ss.xx]`` timestamp is stripped from each line. When the response
     carries no lyric text a warning is logged and no file is written.

     :return: None
     '''
     save_path = os.path.join(
         self.song_path,
         "{song_name}_lyrics.txt".format(song_name=self.song_name))
     Helper.check_file_exits_and_overwrite(save_path)
     lyrics_json = json.loads(self.get_lyrics_format_json())
     # 'lrc' or 'lyric' may be absent or null; treat that as "no lyrics"
     # instead of failing with KeyError/TypeError.
     lyrics_str = (lyrics_json.get('lrc') or {}).get('lyric')
     if lyrics_str is None:
         self.logger.warning(
             "no lyrics found for <{song_name}>,skip saving {save_path}".format(
                 song_name=self.song_name, save_path=save_path))
         return
     # Match to the end of the line rather than requiring a trailing '\n',
     # so a final lyric line without a newline is kept as well.
     pattern = r'\[\d+:\d+\.\d+\](.+)'
     lyrics_list = [line + "\n" for line in re.findall(pattern, lyrics_str)]
     with open(save_path, "w", encoding="utf-8") as f:
         f.write("{song_name}\n{singer_name}\n".format(
             song_name=self.song_name, singer_name=self.singer_name))
         f.writelines(lyrics_list)
     self.logger.info(
         "save {save_path} successfully!".format(save_path=save_path))
Exemple #8
0
    def save_all_comments_to_file_by_multi_threading(self, threads=10):
        '''
        Save all comments to disk using several threads.

        The page count reported by
        ``self.get_song_total_comments_num_and_page_num()`` is cut into
        ``threads`` contiguous ``[begin_page, end_page)`` ranges and each
        range is handed to its own thread running
        ``self.save_pages_comments(begin_page, end_page, total_comments_num)``.
        The call returns once every thread has finished.

        :param threads: number of worker threads, must be at least 1
        :raises ValueError: if ``threads`` is smaller than 1
        '''
        if threads < 1:
            raise ValueError(
                "threads must be a positive integer, got {threads}".format(
                    threads=threads))
        self.no_counter_init()
        # make sure the target file can be (over)written before crawling
        Helper.check_file_exits_and_overwrite(self.comments_file_path)
        start_time = time.time()
        total_comments_num, page = self.get_song_total_comments_num_and_page_num(
        )
        self.logger.info(
            "Song name:{song_name}".format(song_name=self.song_name))
        self.logger.info("There are %d pages of total %d comments!" %
                         (page, total_comments_num))

        # Boundary i is i*page//threads, so range sizes differ by at most one.
        # A plain page//threads chunk size becomes 0 when page < threads,
        # which pushes every page into the last thread.
        bounds = [i * page // threads for i in range(threads + 1)]
        threads_list = [
            Thread(target=self.save_pages_comments,
                   args=(begin_page, end_page, total_comments_num))
            for begin_page, end_page in zip(bounds, bounds[1:])
        ]
        for t in threads_list:
            t.start()
        for t in threads_list:
            t.join()
        end_time = time.time()
        self.logger.info(
            "Using {threads} threads,it costs {cost_time} seconds to crawl <{song_name}>'s all comments!"
            .format(threads=threads,
                    cost_time=(end_time - start_time),
                    song_name=self.song_name))
Exemple #9
0
    def core_visual_analyse(self):
        '''
        Core visualisation routine: render charts about the comments and the
        commenting users with pyecharts.

        Charts produced, in this order:
        1. comment time distribution, per month and per day (bar charts)
        2. like-count distribution (bar chart)
        3. comment keyword distribution after stop-word removal (bar chart)
        4. user location distribution shown on a map
        5. user location distribution (bar chart)
        6. user event-count distribution (bar chart)
        7. user follow-count distribution (bar chart)
        8. user fan-count distribution (bar chart)
        9. user self-description keyword distribution after stop-word removal (bar chart)
        10. user age distribution (bar chart)
        11. total listened-songs distribution (bar chart)

        All files are written below ``<song_path>/<Constants.PLOTS_SAVE_NAME>``.
        '''
        plots_dir = os.path.join(self.song_path,Constants.PLOTS_SAVE_NAME)
        Helper.mkdir(plots_dir)
        # every comment, then every user record
        comments = Helper.load_file_format_json(self.comments_file_path)
        users = Helper.load_file_format_json(self.users_info_file_path)

        def known_ints(records,key):
            # integer values of `key`, leaving out records marked as unknown
            return [int(record[key]) for record in records
                    if record[key] != Constants.UNKNOWN_TOKEN]

        def bar(datas,label,title,key_index,html_name,reverse = False):
            # render one sorted bar chart into the plots directory
            self.save_sorted_bar_plot(
                datas = datas,
                label = label,
                title = title,
                key_index = key_index,
                save_path = os.path.join(plots_dir,html_name),
                reverse = reverse
            )

        # 1. comment time distribution, per month and per day
        stamps = [comment[Constants.CREATE_TIME_STAMP_KEY] for comment in comments]
        month_labels = []  # "year-month" strings
        day_labels = []    # "year-month-day" strings
        for stamp in stamps:
            # the stored value has to be divided by 1000 to get the real timestamp
            month_labels.append(Helper.from_timestamp_to_date(stamp*0.001,format = "%Y-%m"))
            day_labels.append(Helper.from_timestamp_to_date(stamp*0.001,format = "%Y-%m-%d"))

        bar(month_labels,
            "年-月",
            "歌曲<{song_name}>评论时间(年-月)数量分布".format(song_name = self.song_name),
            0,
            Constants.ECHARTS_COMMENTS_YEAR_MONTH_BAR_HTML)

        bar(day_labels,
            "年-月-日",
            "歌曲<{song_name}>评论时间(年-月-日)数量分布".format(song_name = self.song_name),
            0,
            Constants.ECHARTS_COMMENTS_YEAR_MONTH_DAY_BAR_HTML)

        # 2. like-count distribution
        bar(known_ints(comments,Constants.LIKED_COUNT_KEY),
            "点赞数量",
            "歌曲<{song_name}>评论点赞数量分布".format(song_name = self.song_name),
            0,
            Constants.ECHARTS_LIKED_COUNT_BAR_HTML)

        # 3. comment keywords after stop-word removal
        all_comments_text = "".join([comment[Constants.COMMENT_CONTENT_KEY] for comment in comments])
        comment_words = Helper.cut_text(all_comments_text)
        stopwords = Helper.load_stopwords()
        # drop stop words and words shorter than 2 characters
        comment_words = [word for word in comment_words
                         if word not in stopwords and len(word) > 1]
        bar(comment_words,
            "关键词",
            "歌曲<{song_name}>评论关键词数量分布(已去除停用词)".format(song_name = self.song_name),
            1,
            Constants.ECHARTS_COMMENTS_KEYWORDS_BAR_HTML,
            reverse = True)

        # 4. user locations on a map
        locations = [user[Constants.LOCATION_KEY] for user in users]
        matched_cities = []  # city each user is located in
        supported_cities = Helper.load_echarts_support_cities()
        no_match = object()
        for location in locations:
            # the first supported city contained in the location wins
            city = next((candidate for candidate in supported_cities
                         if candidate in location),no_match)
            if city is not no_match:
                matched_cities.append(city)
        city_counts = list(Counter(matched_cities).items())
        geo = Geo("歌曲<{song_name}>评论用户所在地区分布".format(song_name = self.song_name),
                  title_color="#fff", title_pos="left",
                  width=1200, height=600, background_color='#404a59')
        attr, value = geo.cast(city_counts)
        geo.add("", attr, value, visual_range=[0, 200], visual_text_color="#fff",
                symbol_size=15, is_visualmap=True)
        geo_html_path = os.path.join(plots_dir,Constants.ECHARTS_USERS_CITY_GEO_HTML)
        Helper.check_file_exits_and_overwrite(geo_html_path)
        geo.render(geo_html_path)

        # 5. user locations as a bar chart
        bar(locations,
            "用户所在地区",
            "歌曲<{song_name}>评论用户所在地区分布".format(song_name = self.song_name),
            1,
            Constants.ECHARTS_USERS_LOCATION_BAR_HTML,
            reverse = True)

        # 6. user event-count distribution
        bar(known_ints(users,Constants.EVENT_COUNT_KEY),
            "用户动态总数",
            "歌曲<{song_name}>评论用户动态总数分布".format(song_name = self.song_name),
            0,
            Constants.ECHARTS_EVENTS_COUNT_BAR_HTML)

        # 7. user follow-count distribution
        bar(known_ints(users,Constants.FOLLOW_COUNT_KEY),
            "用户关注人数",
            "歌曲<{song_name}>评论用户关注人数分布".format(song_name = self.song_name),
            0,
            Constants.ECHARTS_FOLLOW_COUNT_BAR_HTML)

        # 8. user fan-count distribution
        bar(known_ints(users,Constants.FAN_COUNT_KEY),
            "用户粉丝人数",
            "歌曲<{song_name}>评论用户粉丝人数分布".format(song_name = self.song_name),
            0,
            Constants.ECHARTS_FAN_COUNT_BAR_HTML)

        # 9. user self-description keywords after stop-word removal
        all_descriptions_text = "".join([user[Constants.USER_DESCRIPTION_KEY] for user in users])
        description_words = Helper.cut_text(all_descriptions_text)
        description_words = [word for word in description_words
                             if word not in stopwords and len(word) > 1]
        bar(description_words,
            "用户简介关键词",
            "歌曲<{song_name}>评论用户简介关键词数量分布(已去除停用词)".format(song_name = self.song_name),
            1,
            Constants.ECHARTS_USER_DESCRIPTION_KEYWORDS_BAR_HTML,
            reverse = True)

        # 10. user age distribution; an age has to be greater than or equal to 0
        ages = [age for age in known_ints(users,Constants.USER_AGE_KEY) if age >= 0]
        bar(ages,
            "年龄",
            "歌曲<{song_name}>评论用户年龄分布".format(song_name = self.song_name),
            0,
            Constants.ECHARTS_USER_AGE_BAR_HTML)

        # 11. total listened-songs distribution, bucketed because the range is huge
        songs_bins = {'0-100':0,'100-1000':0,'1000-10000':0,'>10000':0}
        bin_upper_bounds = ((100,'0-100'),(1000,'100-1000'),(10000,'1000-10000'))
        for songs_num in known_ints(users,Constants.LISTENING_SONGS_NUM_KEY):
            for upper_bound,bin_name in bin_upper_bounds:
                if songs_num < upper_bound:
                    songs_bins[bin_name] += 1
                    break
            else:
                # not below any bound: falls into the open-ended last bin
                songs_bins['>10000'] += 1

        bar(songs_bins,
            "听歌总数",
            "歌曲<{song_name}>评论用户听歌总数分布".format(song_name = self.song_name),
            1,
            Constants.ECHARTS_LISTENING_SONGS_NUM_BAR_HTML,
            reverse = True)