Beispiel #1
0
 def save_all_users_info_to_file_by_multi_threading(self,threads = 10):
     '''
     Collect user info with several worker threads and save it to
     ``self.users_info_file_path``.

     The url list from ``load_all_users_url`` is cut into ``threads``
     consecutive slices of ``len(urls) // threads`` entries each; the last
     slice additionally takes whatever remains. Every slice is handed to
     ``save_users_info`` in its own thread.

     :param threads: number of worker threads
     '''
     Helper.check_file_exits_and_overwrite(self.users_info_file_path)
     started_at = time.time()
     all_urls = self.load_all_users_url()
     total = len(all_urls)
     chunk_size = total // threads  # urls handled by each thread
     # initialise the progress counter
     self.no_counter_init()
     workers = []
     for worker_no in range(threads):
         first = worker_no * chunk_size
         if worker_no >= threads - 1:
             chunk = all_urls[first:]
         else:
             chunk = all_urls[first:first + chunk_size]
         workers.append(Thread(target=self.save_users_info, args=(chunk, total)))
     for worker in workers:
         worker.start()
     for worker in workers:
         worker.join()
     elapsed = time.time() - started_at
     self.logger.info("Using {threads} threads to save users info done,costs {cost_time} seconds"
             .format(threads = threads,cost_time = elapsed))
Beispiel #2
0
    def save_singer_all_hot_comments_to_file(self):
        '''
        Save the hot comments of all of the singer's hot songs to
        ``self.singer_all_hot_comments_file_path``.

        The song ids are read from ``self.singer_url``. When none come back
        an error is logged and nothing is written. Otherwise the hot
        comments of every song are gathered, each one is converted with
        ``extract_comment_info_as_json_str`` and the resulting lines are
        passed to ``Helper.save_lines_to_file``.
        '''
        target_file = self.singer_all_hot_comments_file_path
        Helper.check_file_exits_and_overwrite(target_file)
        # id list of all of the singer's hot songs
        hot_song_ids = Helper.get_singer_hot_songs_ids(self.singer_url)
        if len(hot_song_ids) == 0:
            self.logger.error(
                "crawl from %s to get %s all hot songs ids failed!" %
                (self.singer_url, self.singer_name))
            return
        url_template = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_{song_id}/?csrf_token="
        # first gather the raw comments of every song ...
        gathered_comments = []
        for current_id in hot_song_ids:
            comments_url = url_template.format(song_id=current_id)
            gathered_comments.extend(self.get_hot_comments(comments_url))
        # ... then turn each of them into a json string
        json_lines = []
        for raw_comment in gathered_comments:
            json_lines.append(self.extract_comment_info_as_json_str(raw_comment))
        Helper.save_lines_to_file(json_lines, target_file)

        self.logger.info(
            "Write {singer_name}'s {num} hot songs hot comments successfully!".
            format(singer_name=self.singer_name, num=len(hot_song_ids)))
0
 def draw_wordcloud(self,cutted_words_text,save_path,
                    background_path = None,font_path = None,
                    max_words = 2000,max_font_size = 40,background_color = 'white'):
     '''
     Draw a word cloud and save the image to disk.

     :param cutted_words_text: already segmented words joined by spaces
     :param save_path: where the image is written
     :param background_path: mask image path; ``Constants.DEFAULT_BACKGROUND_PATH`` when None
     :param font_path: font file path; ``Constants.DEFAULT_FONT_PATH`` when None
     :param max_words: maximum number of words
     :param max_font_size: maximum font size
     :param background_color: background colour
     :return: None
     '''
     Helper.check_file_exits_and_overwrite(save_path)
     mask_file = Constants.DEFAULT_BACKGROUND_PATH if background_path is None else background_path
     font_file = Constants.DEFAULT_FONT_PATH if font_path is None else font_path
     mask_image = imread(mask_file)
     cloud_options = {
         "font_path": font_file,
         "background_color": background_color,
         "mask": mask_image,
         "max_words": max_words,
         "max_font_size": max_font_size,
     }
     # build the cloud from the text and write it out
     rendered = WordCloud(**cloud_options).generate(cutted_words_text)
     rendered.to_file(save_path)
     self.logger.info("Successfully generate wordcloud img to {save_path}!".format(save_path=save_path))
Beispiel #4
0
    def _download_list_songs_to_file(self,
                                     song_urls,
                                     save_path_list,
                                     total=None):
        '''
        Download a batch of songs, pairing each url with its save path.

        :param song_urls: list of song download urls
        :param save_path_list: list of save paths, parallel to ``song_urls``
        :param total: when None, progress is logged against this batch's own
            length; otherwise progress is logged against ``total`` using the
            shared ``self.no_counter``, updated under ``self.lock``
        :raises ParamsError: if the two lists differ in length
        :return: None
        '''
        n = len(song_urls)
        if n != len(save_path_list):
            raise ParamsError(
                "len(song_urls) must be equal to len(save_path_list)!")
        for i, (song_url, save_path) in enumerate(
                zip(song_urls, save_path_list), 1):
            Helper.download_network_resource(song_url, save_path)
            if total is None:
                self.logger.info("Download %d/%d %s to %s!" %
                                 (i, n, song_url, save_path))
            else:
                # Update the shared counter under the lock. ``with`` releases
                # the lock even when the body raises, so a failure here can
                # no longer leave the other workers blocked on it.
                with self.lock:
                    self.no_counter += 1
                    self.logger.info("Download %d/%d %s to %s!" %
                                     (self.no_counter, total, song_url,
                                      save_path))
Beispiel #5
0
 def save_all_users_info_to_file(self):
     '''
     Save the list returned by ``get_users_info_list`` to
     ``self.users_info_file_path``.

     :return: None
     '''
     Helper.check_file_exits_and_overwrite(self.users_info_file_path)
     info_lines = self.get_users_info_list()
     target_file = self.users_info_file_path
     Helper.save_lines_to_file(info_lines, target_file)
Beispiel #6
0
    def download_singer_hot_songs_by_name_with_multi_threading(
            self, singer_name, threads=20):
        '''
        Download all hot songs of the named singer with several threads.

        The songs are split into ``threads`` consecutive slices of
        ``total // threads`` entries (the last slice also takes the
        remainder) and each slice is downloaded by
        ``_download_list_songs_to_file`` in its own thread.

        :param singer_name: name of the singer
        :param threads: number of threads
        :return: None
        '''
        started_at = time.time()
        # directory the hot songs are saved to
        target_dir = os.path.join(Constants.SINGER_SAVE_DIR, singer_name,
                                  Constants.HOT_SONGS_SAVE_NAME)
        # singer name -> singer id -> singer home page
        singer_id = self.get_singer_id_by_name(singer_name)
        singer_url = "http://music.163.com/artist?id=%d" % singer_id
        # ids of all hot songs, then their download urls and display names
        song_ids = Helper.get_singer_hot_songs_ids(singer_url)
        download_urls = self.get_download_urls_by_ids(song_ids)
        display_names = self.get_songs_name_and_singer_name_str_list_by_ids_list(
            song_ids)
        total = len(download_urls)
        Helper.mkdir(target_dir)
        self.logger.info("%s has total %d hot songs!" % (singer_name, total))
        self.logger.info(
            "(multi threads,thread_num = %d)Now start download hot musics of %s(save path is:%s):"
            % (threads, singer_name, target_dir))
        # reset the shared progress counter
        self.no_counter = 0
        chunk_size = total // threads
        workers = []
        for worker_no in range(threads):
            lo = worker_no * chunk_size
            hi = lo + chunk_size if worker_no < threads - 1 else total
            chunk_targets = []
            for display_name in display_names[lo:hi]:
                chunk_targets.append(
                    os.path.join(target_dir, "%s.mp3" % display_name))
            workers.append(
                Thread(target=self._download_list_songs_to_file,
                       args=(download_urls[lo:hi], chunk_targets, total)))
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        elapsed = time.time() - started_at
        self.logger.info(
            "Download %s's %d hot songs to %s succeed!Costs %.2f seconds!" %
            (singer_name, total, target_dir, elapsed))
Beispiel #7
0
    def download_play_list_songs(self, play_list_id, limit=1000):
        '''
        Download every song of a play list, one after another.

        A failed download is logged and the loop moves on to the next song.

        :param play_list_id: id of the play list
        :param limit: maximum number of songs, passed to ``get_play_list_detail``
        :return: None
        '''
        started_at = time.time()
        # play list detail
        detail = self.get_play_list_detail(play_list_id, limit).json()
        # ids of the songs in the play list
        song_ids = [track['id'] for track in detail['playlist']["trackIds"]]
        # the play list name doubles as the folder name
        playlist_name = detail['playlist']['name']
        target_dir = os.path.join(Constants.PLAY_LIST_SAVE_DIR, playlist_name)
        Helper.mkdir(target_dir)
        # "song name + singer name" strings, then the download urls
        display_names = self.get_songs_name_and_singer_name_str_list_by_ids_list(
            song_ids)
        download_urls = self.get_download_urls_by_ids(song_ids)
        total = len(download_urls)
        self.logger.info("play list %s has total %d songs!" %
                         (playlist_name, total))
        self.logger.info(
            "(single thread)Now start download musics of %s(save path is:%s):"
            % (playlist_name, target_dir))
        for position, song_url in enumerate(download_urls, 1):
            display_name = display_names[position - 1]
            progress = (position, total, display_name)
            try:
                target_file = os.path.join(target_dir,
                                           "%s.mp3" % display_name)
                Helper.download_network_resource(song_url, target_file)
                self.logger.info("Successfully download %d/%d(%s)!" % progress)
            except Exception:
                self.logger.info("Fail download %d/%d(%s)!" % progress)
        elapsed = time.time() - started_at
        self.logger.info(
            "It costs %.2f seconds to download play list %s(id=%s)'s %d songs to %s "
            "using single thread!" % (elapsed, playlist_name, play_list_id,
                                      total, target_dir))
Beispiel #8
0
 def save_users_info(self,users_url,total_urls_num):
     '''
     Fetch the info for a slice of user urls and append it to
     ``self.users_info_file_path``.

     Used as the thread target of
     ``save_all_users_info_to_file_by_multi_threading``.

     :param users_url: list of user urls handled by this call
     :param total_urls_num: total number of urls across all threads
     '''
     users_info_list = self.get_users_info_list(users_url,total_urls_num)
     # Append under the lock. ``with`` releases the lock even if the write
     # raises, so one failing thread cannot leave the others blocked.
     with self.lock:
         Helper.save_lines_to_file(users_info_list,self.users_info_file_path,"a")
Beispiel #9
0
 def get_page_comments_format_raw_json(self, url, page):
     '''
     Fetch the raw json text of one page of comments.

     :param url: request url
     :param page: page number
     :return: the text from ``Helper.get_json``; bytes are decoded as utf-8,
         anything else is returned as received
     '''
     raw_reply = Helper.get_json(url, Helper.get_params(page))
     if not isinstance(raw_reply, bytes):
         return raw_reply
     # bytes -> str
     return raw_reply.decode("utf-8")
Beispiel #10
0
    def download_singer_hot_songs_by_name(self, singer_name):
        '''
        Download all hot songs of the named singer, one after another.

        A failed download is logged and the loop moves on to the next song.

        :param singer_name: name of the singer
        :return: None
        '''
        started_at = time.time()
        # directory the hot songs are saved to
        target_dir = os.path.join(Constants.SINGER_SAVE_DIR, singer_name,
                                  Constants.HOT_SONGS_SAVE_NAME)
        # singer name -> singer id -> singer home page
        singer_id = self.get_singer_id_by_name(singer_name)
        singer_url = "http://music.163.com/artist?id=%d" % singer_id
        # ids of all hot songs, then their download urls and display names
        song_ids = Helper.get_singer_hot_songs_ids(singer_url)
        download_urls = self.get_download_urls_by_ids(song_ids)
        display_names = self.get_songs_name_and_singer_name_str_list_by_ids_list(
            song_ids)
        total = len(download_urls)
        Helper.mkdir(target_dir)
        self.logger.info("%s has total %d hot songs!" % (singer_name, total))
        self.logger.info(
            "(single thread)Now start download hot musics of %s(save path is:%s):"
            % (singer_name, target_dir))
        for position, song_url in enumerate(download_urls, 1):
            display_name = display_names[position - 1]
            progress = (position, total, display_name)
            try:
                target_file = os.path.join(target_dir,
                                           "%s.mp3" % display_name)
                Helper.download_network_resource(song_url, target_file)
                self.logger.info("Successfully download %d/%d(%s)!" % progress)
            except Exception:
                self.logger.info("Fail download %d/%d(%s)!" % progress)
        elapsed = time.time() - started_at
        self.logger.info(
            "It costs %.2f seconds to download singer %s's %d hot songs to %s "
            "using single thread!" %
            (elapsed, singer_name, total, target_dir))
Beispiel #11
0
 def pretty_print_self_fans(self, offset=0, limit=30):
     '''
     Log the details of the logged-in user's fans, one field per log line.

     :param offset: start position, passed to ``get_self_fans``
     :param limit: maximum number of fans, passed to ``get_self_fans``
     :return: None
     '''
     res = self.netcloud_login.get_self_fans(offset=offset, limit=limit).json()
     fans = res['followeds']
     # number of fans returned
     num = len(fans)
     self.logger.info("My fans list is(count %d):" % num)
     for index, content in enumerate(fans, 1):
         self.logger.info("-" * 20 + "  fans %d  " % index + "-" * 20)
         # user name
         self.logger.info("user name:%s" % content["nickname"])
         # user id
         self.logger.info("user id:%s" % content["userId"])
         # user signature
         self.logger.info("user signature:%s" % content["signature"])
         # Gender: 1 is logged as male, any other value as female. The
         # parentheses are required because "%" binds tighter than the
         # conditional expression; without them the "gender:" prefix was
         # dropped for every non-male value.
         self.logger.info("gender:%s" % ("male" if content["gender"] == 1 else "female"))
         # avatar url
         self.logger.info("avatar url:%s" % content["avatarUrl"])
         # number of play lists
         self.logger.info("play list count:%s" % content["playlistCount"])
         # number of events
         self.logger.info("event count:%s" % content["eventCount"])
         # number of fans
         self.logger.info("fans count:%s" % content["followeds"])
         # number of followed users
         self.logger.info("follows count:%s" % content["follows"])
         # follow time as year-month-day; "time" is scaled by 0.001 before conversion
         self.logger.info("follow time:%s" % Helper.from_timestamp_to_date(content["time"] * 0.001, "%Y-%m-%d"))
Beispiel #12
0
 def __init__(self,*args,**kwargs):
     '''
     Set up the logger and the ``NetCloudLogin`` object.

     :param args: positional arguments forwarded to ``NetCloudLogin``
     :param kwargs: keyword arguments forwarded to ``NetCloudLogin``
     '''
     self.logger = Helper.get_logger()
     # Unpack the arguments so NetCloudLogin receives them as the caller
     # wrote them; passing ``args, kwargs`` handed it one tuple and one dict
     # as two positional values. With no arguments this is NetCloudLogin().
     self.netcloud_login = NetCloudLogin(*args, **kwargs)
 def __init__(self):
     '''
     Set up the logger, a fixed singer/song pair, a crawler for that pair
     and the singer's page url built from the crawler's ``singer_id``.
     '''
     singer, song = "刘瑞琪", "离开的借口"
     self.logger = Helper.get_logger()
     self.singer_name = singer
     self.song_name = song
     crawler = Crawler.NetCloudCrawler(song, singer)
     self.crawler = crawler
     url_template = 'http://music.163.com/artist?id={singer_id}'
     self.singer_url = url_template.format(singer_id=crawler.singer_id)
Beispiel #14
0
 def draw_all_comments_wordcloud(self):
     '''
     Draw the word cloud of all comments of the song with default settings.

     The comments file is created first when it does not exist yet. When
     the loaded list is empty an error is logged and nothing is drawn.

     :return: None
     '''
     comments_file = self.comments_file_path
     # crawl and save the comments first if they are not on disk yet
     if not os.path.exists(comments_file):
         self.save_all_comments_to_file()
     comments = Helper.load_file_format_json(comments_file)
     if len(comments) == 0:
         self.logger.error("Load %s failed!" % comments_file)
         return
     merged_text = "".join(
         entry[Constants.COMMENT_CONTENT_KEY] for entry in comments)
     stopwords = Helper.load_stopwords()
     # keep only the words that are not stopwords
     kept_words = []
     for word in Helper.cut_text(merged_text):
         if word not in stopwords:
             kept_words.append(word)
     image_name = "%s_all_comments.png" % self.song_name
     self.draw_wordcloud(" ".join(kept_words),
                         os.path.join(self.song_path, image_name))
Beispiel #15
0
 def save_sorted_bar_plot(self,datas,label,title,key_index,
                          save_path,reverse = False):
     '''
     Count the values in ``datas``, sort the (value, count) pairs and save
     them as a bar chart.

     :param datas: input values to count
     :param label: series label
     :param title: chart title
     :param key_index: index into each (value, count) pair used as sort key
     :param save_path: where the chart is rendered to
     :param reverse: sort descending when True (ascending by default)
     :return: None
     '''
     Helper.check_file_exits_and_overwrite(save_path)
     counted_pairs = list(Counter(datas).items())
     counted_pairs.sort(key=itemgetter(key_index), reverse=reverse)
     # split the sorted pairs into the two axes
     x_values, y_values = zip(*counted_pairs)
     chart = Bar(title)
     chart.add(label, x_values, y_values)
     chart.render(save_path)
Beispiel #16
0
    def send(self):
        '''
        Send the request selected by ``self.method`` (the core request routine).

        The search method is POSTed with ``self.data`` unchanged through
        ``self._get_requests()``. Every other method goes through
        ``self._get_webapi_requests()``: dict data is first passed to
        ``Helper.encrypted_request``, the url template is filled with
        ``self.params['uid']`` or ``self.params['id']`` where the method
        needs it, and the lyric method is fetched with GET instead of POST.

        On success the reply is handed to ``self._build_response`` and
        ``self.response.ok`` is set to True. Any exception raised while
        sending is printed, logged and stored in ``self.response.error``
        instead of being propagated.

        :raises ParamsError: if ``self.method`` is None
        :return: None
        '''
        # the request method is mandatory
        if self.method is None:
            raise ParamsError()
        try:
            if self.method == Constants.SEARCH_REQUEST_METHOD:
                # search request: plain session, data sent as is
                req = self._get_requests()
                _url = Constants.MUSIC163_BASE_URL + Constants.REQUEST_METHODS[
                    self.method]
                resp = req.post(_url, data=self.data)
            else:
                # Start from the raw payload so ``data`` is always bound;
                # before, a non-dict payload left it undefined and the POST
                # below failed with UnboundLocalError.
                data = self.data
                if isinstance(data, dict):
                    # dict payloads are encrypted before sending
                    data = Helper.encrypted_request(data)
                # non-search requests use the webapi session
                req = self._get_webapi_requests()
                _url = Constants.MUSIC163_BASE_URL + Constants.REQUEST_METHODS[
                    self.method]
                # user dj / follows / events: the url needs the user id
                if self.method in (Constants.USER_DJ_REQUEST_METHOD,
                                   Constants.USER_FOLLOWS_REQUEST_METHOD,
                                   Constants.USER_EVENT_REQUEST_METHOD):
                    _url = _url % self.params['uid']
                # lyrics / music comments / album comments: the url needs the id
                if self.method in (Constants.LYRIC_REQUEST_METHOD,
                                   Constants.MUSIC_COMMENT_REQUEST_METHOD,
                                   Constants.ALBUM_COMMENT_REQUEST_METHOD):
                    _url = _url % self.params['id']
                if self.method == Constants.LYRIC_REQUEST_METHOD:
                    # lyrics need no post data
                    resp = req.get(_url)
                else:
                    resp = req.post(_url, data=data)
            # wrap the raw reply and mark the request as successful
            self._build_response(resp)
            self.response.ok = True
        except Exception as why:
            # print the stack trace
            traceback.print_exc()
            # The message needs a placeholder for the exception: passing it
            # as an extra argument without one makes the logging call itself
            # fail to format.
            self.logger.info('Requests Exception: %s', why)
            # keep the exception on the response
            self.response.error = why
Beispiel #17
0
 def save_all_comments_to_file(self):
     '''
     Fetch all comments sequentially and save them to
     ``self.comments_file_path``, then print how long it took.

     :return: None
     '''
     Helper.check_file_exits_and_overwrite(self.comments_file_path)
     started_at = time.time()
     # every comment is converted to a json string before saving
     json_lines = []
     for comment in self.get_all_comments():
         json_lines.append(self.extract_comment_info_as_json_str(comment))
     Helper.save_lines_to_file(json_lines, self.comments_file_path)
     elapsed = time.time() - started_at
     print("It costs %.2f seconds to crawler <%s>." %
           (elapsed, self.song_name))
Beispiel #18
0
 def save_lyrics_to_file(self):
     '''
     Save the lyrics of the song to ``<song_name>_lyrics.txt`` under
     ``self.song_path``.

     The file starts with the song name and the singer name, followed by
     the text captured after each ``[mm:ss.xx]`` style tag of the lyrics.

     :return: None
     '''
     file_name = "{song_name}_lyrics.txt".format(song_name=self.song_name)
     lyrics_file = os.path.join(self.song_path, file_name)
     Helper.check_file_exits_and_overwrite(lyrics_file)
     raw_lyrics = json.loads(self.get_lyrics_format_json())['lrc']['lyric']
     # capture what follows a time tag, up to and including the newline
     tagged_line = re.compile(r'\[\d+:\d+\.\d+\](.+?\n)')
     lyric_lines = tagged_line.findall(raw_lyrics)
     with open(lyrics_file, "w", encoding="utf-8") as out:
         heading = "{song_name}\n{singer_name}\n".format(
             song_name=self.song_name, singer_name=self.singer_name)
         out.write(heading)
         out.writelines(lyric_lines)
     self.logger.info(
         "save {save_path} successfully!".format(save_path=lyrics_file))
Beispiel #19
0
 def save_pages_comments(self, begin_page, end_page, total_comments_num):
     '''
     Fetch the comments of a range of pages and append them to
     ``self.comments_file_path`` (thread target of the multi-threaded save).

     Indices ``begin_page`` (inclusive) to ``end_page`` (exclusive) are
     walked and each index ``i`` is requested as page ``i + 1``. A page
     that fails to parse is logged and skipped; comments collected from it
     before the failure are kept.

     :param begin_page: first page index (inclusive)
     :param end_page: last page index (exclusive)
     :param total_comments_num: total number of comments, used in progress logs
     '''
     # json strings of all comments collected by this call
     comments_info_list = []
     for i in range(begin_page, end_page):
         page_no = i + 1
         json_dict = self.get_page_comments_format_dict(
             self.comments_url, page_no)
         try:
             for item in json_dict[Constants.COMMENTS_KEY]:
                 json_str = self.extract_comment_info_as_json_str(item)
                 # Update the shared counter under the lock. ``with``
                 # releases it even if the body raises, so the other
                 # threads are never left blocked on it.
                 with self.lock:
                     self.no_counter += 1
                     self.logger.info("get %d/%d music comment succeed!" %
                                      (self.no_counter, total_comments_num))
                 comments_info_list.append(json_str)
         except KeyError as key_error:
             self.logger.error("Fail to get page {page}.".format(page=page_no))
             self.logger.error(
                 "Server parse error:{error}".format(error=key_error))
         except Exception as e:
             self.logger.error("Fail to get page {page}.".format(page=page_no))
             self.logger.error(e)
         else:
             self.logger.info(
                 "Successfully to save page {page}.".format(page=page_no))
     # append to the shared file under the lock
     with self.lock:
         Helper.save_lines_to_file(comments_info_list,
                                   self.comments_file_path, "a")
     self.logger.info(
         "Write page {begin_page} to {end_page} successfully!".format(
             begin_page=begin_page, end_page=end_page))
Beispiel #20
0
    def save_all_comments_to_file_by_multi_threading(self, threads=10):
        '''
        Save all comments of the song to ``self.comments_file_path`` with
        several threads.

        The pages are split into ``threads`` consecutive ranges of
        ``page // threads`` pages (the last range also takes the remainder)
        and each range is handled by ``save_pages_comments`` in its own
        thread.

        :param threads: number of threads
        '''
        self.no_counter_init()
        Helper.check_file_exits_and_overwrite(self.comments_file_path)
        started_at = time.time()
        comment_total, page_total = self.get_song_total_comments_num_and_page_num()
        self.logger.info(
            "Song name:{song_name}".format(song_name=self.song_name))
        self.logger.info("There are %d pages of total %d comments!" %
                         (page_total, comment_total))

        pages_per_thread = page_total // threads
        workers = []
        for worker_no in range(threads):
            first_page = worker_no * pages_per_thread
            if worker_no < threads - 1:
                stop_page = first_page + pages_per_thread
            else:
                stop_page = page_total
            workers.append(
                Thread(target=self.save_pages_comments,
                       args=(first_page, stop_page, comment_total)))
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        elapsed = time.time() - started_at
        self.logger.info(
            "Using {threads} threads,it costs {cost_time} seconds to crawl <{song_name}>'s all comments!"
            .format(threads=threads,
                    cost_time=elapsed,
                    song_name=self.song_name))
Beispiel #21
0
 def pretty_print_self_info(self):
     '''
     Log in and write the account's personal details to the logger.

     :return: None
     '''
     login_info = self.netcloud_login.login().json()
     profile = login_info['profile']
     fields = {}
     fields['avatarUrl'] = profile['avatarUrl']  # avatar url
     fields['signature'] = profile['signature']  # signature
     fields['nickname'] = profile['nickname']  # nick name
     fields['userName'] = login_info['account']['userName']  # user name
     fields['province_id'] = profile['province']  # province
     birthday_value = profile['birthday']  # birthday
     # a negative value is shown as "unknown"; otherwise it is scaled by
     # 0.001 and converted to a year-month-day string
     fields['birthday'] = "unknown" if birthday_value < 0 else \
         Helper.from_timestamp_to_date(time_stamp=birthday_value * 0.001, format="%Y-%m-%d")
     fields['description'] = profile['description']
     gender_code = profile['gender']
     # 1 -> male, 0 -> female, anything else -> unknown
     fields['gender'] = 'male' if gender_code == 1 else (
         'female' if gender_code == 0 else 'unknown')
     fields['userId'] = profile['userId']
     bindings = login_info['bindings']
     # the first binding is read for the cellphone, the second for the email
     fields['cellphone'] = json.loads(bindings[0]['tokenJsonStr'])['cellphone']
     fields['email'] = json.loads(bindings[1]['tokenJsonStr'])['email']
     greeting = "Hello,{nickname}!\nHere is your personal info:"
     self.logger.info(greeting.format(nickname=fields['nickname']))
     details = ("avatarUrl:{avatarUrl}\nsignature:{signature}\n"
                "nickname:{nickname}\n"
                "userName:{userName}\nprovince_id:{province_id}\n"
                "birthday:{birthday}\ndescription:{description}\n"
                "gender:{gender}\nuserId:{userId}\n"
                "cellphone:{cellphone}\nemail:{email}\n")
     self.logger.info(details.format(**fields))
Beispiel #22
0
 def pretty_print_search_song(self, search_song_name, offset=0, limit=30):
     '''
     Search for a song by name and log every result, one field per line.

     :param search_song_name: name of the song to search for
     :param offset: start position
     :param limit: maximum number of results
     :return: None
     '''
     # call the search api
     reply = self.netcloud_login.search(keyword=search_song_name, type_=1, offset=offset, limit=limit).json()
     songs = reply['result']['songs']
     hit_count = len(songs)  # number of search results
     log = self.logger.info
     log("Your search song name is:%s" % search_song_name)
     log("Here is your search result(total %d):" % hit_count)
     for position, song in enumerate(songs, 1):
         log("-" * 20 + "  search result %d  " % position + "-" * 20)
         # song name and alias
         log("song name:%s" % song['name'])
         log("alias:%s" % song['alias'])
         # singer names (a song may have several singers)
         log("singer:")
         for artist in song['artists']:
             log(artist['name'])
         # album name
         log("\nalbum:%s" % song['album']['name'])
         # album publish time as year-month-day
         publish_date = Helper.from_timestamp_to_date(song['album']['publishTime'] * 0.001, format="%Y-%m-%d")
         log("album publish time:%s" % publish_date)
         # song duration split into minutes and seconds
         duration = song['duration']
         minutes = duration // 60000
         seconds = duration // 1000 % 60
         log("song duration:%s m,%s s." % (minutes, seconds))
         # song id
         log("song id:%s" % song["id"])
         # singer ids (a song may have several singers)
         log("singer id:")
         for artist in song["artists"]:
             log(artist['id'])
         # album id
         log("\nalbum id:%s" % song['album']['id'])
         # mv id
         log("mv id:%s" % song["mvid"])
Beispiel #23
0
 def load_all_users_url(self):
     '''
     Build the list of user home page urls from the saved comments file.

     The comments file is created first (multi-threaded) when it does not
     exist. Only user ids that consist of digits are kept, and duplicate
     urls are removed, so the order of the result is not defined.

     :return: list of unique user home page urls
     '''
     comments_file = self.comments_file_path
     if not os.path.exists(comments_file):
         self.save_all_comments_to_file_by_multi_threading()
     # list of comment dicts
     comments = Helper.load_file_format_json(comments_file)
     all_ids = []
     for comment in comments:
         all_ids.append(comment[Constants.USER_ID_KEY])
     # a user id has to look like a string of digits
     digits_only = re.compile(r'^\d+$')
     url_template = 'http://music.163.com/user/home?id={user_id}'
     home_urls = [
         url_template.format(user_id=candidate)
         for candidate in all_ids
         if digits_only.match(str(candidate))
     ]
     # drop duplicates
     return list(set(home_urls))
Beispiel #24
0
 def pretty_print_search_user(self, keyword, offset=0, limit=30):
     '''
     Search for users by keyword and log every result, one field per line.

     :param keyword: search keyword
     :param offset: start position
     :param limit: maximum number of results
     :return: None
     '''
     res = self.netcloud_login.search(keyword, type_=1002, offset=offset, limit=limit).json()
     profiles = res['result']["userprofiles"]
     # number of users returned
     num = len(profiles)
     self.logger.info("Your search user keyword is:%s" % keyword)
     self.logger.info("Here is your search result(%d count):" % num)
     for index, content in enumerate(profiles, 1):
         self.logger.info("-" * 20 + "  search result %d  " % index + "-" * 20)
         # user name
         self.logger.info("user name:%s" % content['nickname'])
         # user signature
         self.logger.info("user signature:%s" % content["signature"])
         # user description
         self.logger.info("user description:%s" % content["description"])
         # detailed user description
         self.logger.info("user detail description:%s" % content["detailDescription"])
         # user id
         self.logger.info("user id:%s" % content["userId"])
         # province
         self.logger.info("province id:%s" % content["province"])
         # city
         self.logger.info("city id:%s" % content["city"])
         # Gender: 1 is logged as male, any other value as female. The
         # parentheses are required because "%" binds tighter than the
         # conditional expression; without them the "gender:" prefix was
         # dropped for every non-male value.
         self.logger.info("gender:%s" % ("male" if content["gender"] == 1 else "female"))
         # birthday as year-month-day; the value is scaled by 0.001 before conversion
         self.logger.info("birthday:%s" % Helper.from_timestamp_to_date(content["birthday"] * 0.001, "%Y-%m-%d"))
         # avatar url
         self.logger.info("avatar url:%s" % content["avatarUrl"])
         # background image url
         self.logger.info("background image url:%s" % content["backgroundUrl"])
 def test_get_singer_hot_songs_ids(self):
     '''
     Fetch the hot song ids for ``self.singer_url`` and log the result.
     '''
     hot_song_ids = Helper.get_singer_hot_songs_ids(self.singer_url)
     self.logger.info(hot_song_ids)
Beispiel #26
0
 def get_users_info_list(self,users_url = None,total_urls_num = None):
     '''
     获取一周歌曲下全部用户信息list
     :param users_url: 传入用户url list
     :param total_urls_num: 全部urls 数量,默认是None,不为None时,说明正在进行多线程调用
     :return: list(dict)
     '''
     users_info_list = []
     if users_url is None:
         # 获取歌曲下全部用户url list
         users_url = self.load_all_users_url()
     num = len(users_url)
     # 遍历每个用户url
     for index, user_url in enumerate(users_url, 1):
         try:
             user_id = re.search(r'.*id=(\d+)', user_url).group(1)  # 用户id
             # 抓取时间
             crawler_time = Helper.from_timestamp_to_date(time_stamp=time.time())
             # 获取html
             html = requests.get(user_url, headers=Constants.REQUEST_HEADERS).text
             # 动态总数
             event_count_pattern = re.compile(r'<strong id="event_count">(\d+?)</strong>')
             event_count = re.search(event_count_pattern, html)
             if event_count:
                 event_count = event_count.group(1)
             else:
                 event_count = Constants.UNKNOWN_TOKEN
             # 用户关注数
             follow_count_pattern = re.compile(r'<strong id="follow_count">(\d+?)</strong>')
             follow_count = re.search(follow_count_pattern, html)
             if follow_count:
                 follow_count = follow_count.group(1)
             else:
                 follow_count = Constants.UNKNOWN_TOKEN
             # 用户粉丝数
             fan_count_pattern = re.compile(r'<strong id="fan_count">(\d+?)</strong>')
             fan_count = re.search(fan_count_pattern, html)
             if fan_count:
                 fan_count = fan_count.group(1)
             else:
                 fan_count = Constants.UNKNOWN_TOKEN
             # 用户所在地区
             location_pattern = re.compile('<span>所在地区:(.+?)</span>')
             location = re.search(location_pattern, html)
             if location:
                 location = location.group(1)
             else:
                 location = Constants.UNKNOWN_TOKEN
             # 用户个人描述
             description_pattern = re.compile('<div class="inf s-fc3 f-brk">个人介绍:(.*?)</div>')
             description = re.search(description_pattern, html)
             if description:
                 description = description.group(1)
             else:
                 description = Constants.UNKNOWN_TOKEN
             # 用户年龄
             age_pattern = re.compile(r'<span.*?data-age="(\d+)">')
             age = re.search(age_pattern, html)
             if age:
                 age = age.group(1)  # 时间戳形式
                 # 今年
                 current_year = int(Helper.from_timestamp_to_date(time_stamp=time.time(), format="%Y"))
                 # 得到用户真实年龄
                 age = (current_year - 1970) - int(age) // (1000 * 365 * 24 * 3600)
             else:
                 age = Constants.UNKNOWN_TOKEN
             # 累计听歌
             listening_songs_num_pattern = re.compile('<h4>累积听歌(\d+?)首</h4>')
             listening_songs_num = re.search(listening_songs_num_pattern, html)
             if listening_songs_num:
                 listening_songs_num = listening_songs_num.group(1)
             else:
                 listening_songs_num = Constants.UNKNOWN_TOKEN
             # 将用户信息以json形式保存到磁盘
             user_info_dict = {
                 Constants.USER_ID_KEY: user_id,
                 Constants.CRAWLER_TIME_KEY: crawler_time,
                 Constants.EVENT_COUNT_KEY: event_count,
                 Constants.FOLLOW_COUNT_KEY: follow_count,
                 Constants.FAN_COUNT_KEY: fan_count,
                 Constants.LOCATION_KEY: location,
                 Constants.USER_DESCRIPTION_KEY: description,
                 Constants.USER_AGE_KEY: age,
                 Constants.LISTENING_SONGS_NUM_KEY: listening_songs_num
             }
             user_info_json_str = json.dumps(user_info_dict, ensure_ascii=False)
             users_info_list.append(user_info_json_str)
             if total_urls_num: # 多线程调用
                 if self.lock.acquire():
                     self.no_counter += 1
                     self.logger.info(
                         "Write {current}/{total} user info to file successfully!".format(current=self.no_counter, total=total_urls_num))
                     self.lock.release()
             else: # 普通单线程调用
                 self.logger.info(
                     "Write {current}/{total} user info to file successfully!".format(current=index, total=num))
         except Exception as e:
             self.logger.error("Fail to get No.{index} comment user's info:{error}"
                               .format(index=index, error=e))
     return users_info_list
Beispiel #27
0
    def core_visual_analyse(self):
        '''
        Core visualisation routine for comments and user info, drawn with pyecharts.

        Loads the saved comments and users info, then writes the following
        charts (in this order) into the song's plot directory:
        1. comment time distribution, per month and per day, bar charts
        2. liked-count distribution, bar chart
        3. comment keyword distribution after stop-word removal, bar chart
        4. user location distribution, shown on a map
        5. user location distribution, bar chart
        6. user event-count distribution, bar chart
        7. user follow-count distribution, bar chart
        8. user fan-count distribution, bar chart
        9. user description keyword distribution after stop-word removal, bar chart
        10. user age distribution, bar chart
        11. user total-listened-songs distribution (bucketed), bar chart
        '''
        plot_dir = os.path.join(self.song_path, Constants.PLOTS_SAVE_NAME)
        Helper.mkdir(plot_dir)
        comments = Helper.load_file_format_json(self.comments_file_path)
        users = Helper.load_file_format_json(self.users_info_file_path)

        def html_path(file_name):
            # Destination of one rendered chart inside the plot directory.
            return os.path.join(plot_dir, file_name)

        def titled(template):
            # Fill the song name into a chart title template.
            return template.format(song_name=self.song_name)

        def known_ints(records, key):
            # Integer values stored under `key`, skipping records holding the unknown token.
            return [int(record[key]) for record in records
                    if record[key] != Constants.UNKNOWN_TOKEN]

        def meaningful_words(words):
            # Drop stop words and words shorter than two characters.
            # `stopwords` is bound further down, before the first call.
            return [word for word in words if word not in stopwords and len(word) > 1]

        # 1. Comment time distribution (year-month and year-month-day).
        month_labels = []
        day_labels = []
        for comment in comments:
            # The stored timestamp has to be scaled by 1/1000 to get the real one.
            seconds = comment[Constants.CREATE_TIME_STAMP_KEY] * 0.001
            month_labels.append(Helper.from_timestamp_to_date(seconds, format="%Y-%m"))
            day_labels.append(Helper.from_timestamp_to_date(seconds, format="%Y-%m-%d"))

        self.save_sorted_bar_plot(
            datas=month_labels,
            label="年-月",
            title=titled("歌曲<{song_name}>评论时间(年-月)数量分布"),
            key_index=0,
            save_path=html_path(Constants.ECHARTS_COMMENTS_YEAR_MONTH_BAR_HTML)
        )
        self.save_sorted_bar_plot(
            datas=day_labels,
            label="年-月-日",
            title=titled("歌曲<{song_name}>评论时间(年-月-日)数量分布"),
            key_index=0,
            save_path=html_path(Constants.ECHARTS_COMMENTS_YEAR_MONTH_DAY_BAR_HTML)
        )

        # 2. Liked-count distribution.
        liked_counts = known_ints(comments, Constants.LIKED_COUNT_KEY)
        self.save_sorted_bar_plot(
            datas=liked_counts,
            label="点赞数量",
            title=titled("歌曲<{song_name}>评论点赞数量分布"),
            key_index=0,
            save_path=html_path(Constants.ECHARTS_LIKED_COUNT_BAR_HTML)
        )

        # 3. Comment keyword distribution after stop-word removal.
        comments_text = "".join(comment[Constants.COMMENT_CONTENT_KEY] for comment in comments)
        raw_comment_words = Helper.cut_text(comments_text)
        stopwords = Helper.load_stopwords()
        comment_words = meaningful_words(raw_comment_words)
        self.save_sorted_bar_plot(
            datas=comment_words,
            label="关键词",
            title=titled("歌曲<{song_name}>评论关键词数量分布(已去除停用词)"),
            key_index=1,
            save_path=html_path(Constants.ECHARTS_COMMENTS_KEYWORDS_BAR_HTML),
            reverse=True
        )

        # 4. User location distribution on a map.
        user_locations = [user[Constants.LOCATION_KEY] for user in users]
        supported_cities = Helper.load_echarts_support_cities()
        matched_cities = []
        for location in user_locations:
            # Only the first supported city contained in the location text is counted.
            first_match = next((city for city in supported_cities if city in location), None)
            if first_match is not None:
                matched_cities.append(first_match)
        city_counts = list(Counter(matched_cities).items())
        geo_chart = Geo(
            titled("歌曲<{song_name}>评论用户所在地区分布"),
            title_color="#fff",
            title_pos="left",
            width=1200,
            height=600,
            background_color='#404a59'
        )
        attr, value = geo_chart.cast(city_counts)
        geo_chart.add(
            "", attr, value,
            visual_range=[0, 200],
            visual_text_color="#fff",
            symbol_size=15,
            is_visualmap=True
        )
        geo_html = html_path(Constants.ECHARTS_USERS_CITY_GEO_HTML)
        Helper.check_file_exits_and_overwrite(geo_html)
        geo_chart.render(geo_html)

        # 5. User location distribution as a bar chart.
        self.save_sorted_bar_plot(
            datas=user_locations,
            label="用户所在地区",
            title=titled("歌曲<{song_name}>评论用户所在地区分布"),
            key_index=1,
            save_path=html_path(Constants.ECHARTS_USERS_LOCATION_BAR_HTML),
            reverse=True
        )

        # 6. User event-count distribution.
        event_counts = known_ints(users, Constants.EVENT_COUNT_KEY)
        self.save_sorted_bar_plot(
            datas=event_counts,
            label="用户动态总数",
            title=titled("歌曲<{song_name}>评论用户动态总数分布"),
            key_index=0,
            save_path=html_path(Constants.ECHARTS_EVENTS_COUNT_BAR_HTML)
        )

        # 7. User follow-count distribution.
        follow_counts = known_ints(users, Constants.FOLLOW_COUNT_KEY)
        self.save_sorted_bar_plot(
            datas=follow_counts,
            label="用户关注人数",
            title=titled("歌曲<{song_name}>评论用户关注人数分布"),
            key_index=0,
            save_path=html_path(Constants.ECHARTS_FOLLOW_COUNT_BAR_HTML)
        )

        # 8. User fan-count distribution.
        fan_counts = known_ints(users, Constants.FAN_COUNT_KEY)
        self.save_sorted_bar_plot(
            datas=fan_counts,
            label="用户粉丝人数",
            title=titled("歌曲<{song_name}>评论用户粉丝人数分布"),
            key_index=0,
            save_path=html_path(Constants.ECHARTS_FAN_COUNT_BAR_HTML)
        )

        # 9. User description keyword distribution after stop-word removal.
        descriptions_text = "".join(user[Constants.USER_DESCRIPTION_KEY] for user in users)
        description_words = meaningful_words(Helper.cut_text(descriptions_text))
        self.save_sorted_bar_plot(
            datas=description_words,
            label="用户简介关键词",
            title=titled("歌曲<{song_name}>评论用户简介关键词数量分布(已去除停用词)"),
            key_index=1,
            save_path=html_path(Constants.ECHARTS_USER_DESCRIPTION_KEYWORDS_BAR_HTML),
            reverse=True
        )

        # 10. User age distribution; negative ages are discarded.
        ages = [age for age in known_ints(users, Constants.USER_AGE_KEY) if age >= 0]
        self.save_sorted_bar_plot(
            datas=ages,
            label="年龄",
            title=titled("歌曲<{song_name}>评论用户年龄分布"),
            key_index=0,
            save_path=html_path(Constants.ECHARTS_USER_AGE_BAR_HTML)
        )

        # 11. Total listened songs, bucketed because the raw range is too wide.
        songs_counts = known_ints(users, Constants.LISTENING_SONGS_NUM_KEY)
        buckets = {'0-100': 0, '100-1000': 0, '1000-10000': 0, '>10000': 0}
        for songs_count in songs_counts:
            if songs_count >= 10000:
                bucket = '>10000'
            elif songs_count >= 1000:
                bucket = '1000-10000'
            elif songs_count >= 100:
                bucket = '100-1000'
            else:
                bucket = '0-100'
            buckets[bucket] += 1

        self.save_sorted_bar_plot(
            datas=buckets,
            label="听歌总数",
            title=titled("歌曲<{song_name}>评论用户听歌总数分布"),
            key_index=1,
            save_path=html_path(Constants.ECHARTS_LISTENING_SONGS_NUM_BAR_HTML),
            reverse=True
        )
Beispiel #28
0
 def __init__(self):
     '''
     Set up the logger and the two NetCloud helper objects stored on the instance.
     '''
     self.logger = Helper.get_logger()
     # Both helpers are built without arguments ("login without parameters");
     # presumably they log in with default/stored credentials -- TODO confirm.
     self.login_printer = NetCloudPrinter()
     self.netcloud_login = NetCloudLogin()
Beispiel #29
0
 def test_get_download_urls_by_ids(self):
     '''
     Look up the hot-song ids of a fixed artist page and log the result of
     get_download_urls_by_ids for them.
     '''
     hot_song_ids = Helper.get_singer_hot_songs_ids("http://music.163.com/artist?id=9621")
     log = self.logger.info
     log(self.netcloud_login.get_download_urls_by_ids(hot_song_ids))
Beispiel #30
0
 def test_get_songs_name_list_by_ids_list(self):
     '''
     Look up the hot-song ids of a fixed artist page and log the result of
     get_songs_name_list_by_ids_list for them.
     '''
     hot_song_ids = Helper.get_singer_hot_songs_ids("http://music.163.com/artist?id=7214")
     log = self.logger.info
     log(self.netcloud_login.get_songs_name_list_by_ids_list(hot_song_ids))