def save_all_users_info_to_file_by_multi_threading(self, threads=10):
    '''
    Save the info of all users to disk, sped up with several worker threads.

    The user url list is cut into ``threads`` consecutive slices; each slice is
    handed to ``self.save_users_info`` running in its own thread.

    :param threads: number of worker threads
    '''
    Helper.check_file_exits_and_overwrite(self.users_info_file_path)
    began_at = time.time()
    all_urls = self.load_all_users_url()
    total = len(all_urls)
    # Number of urls given to every worker except the last one.
    chunk_size = total // threads
    workers = []
    # Reset the shared progress counter before any worker starts.
    self.no_counter_init()
    last_index = threads - 1
    for idx in range(threads):
        lower = idx * chunk_size
        # The last worker also takes the remainder left by the floor division.
        upper = None if idx == last_index else lower + chunk_size
        workers.append(
            Thread(target=self.save_users_info,
                   args=(all_urls[lower:upper], total)))
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    elapsed = time.time() - began_at
    self.logger.info("Using {threads} threads to save users info done,costs {cost_time} seconds"
                     .format(threads=threads, cost_time=elapsed))
def save_singer_all_hot_comments_to_file(self):
    '''
    Save the hot comments of all of the singer's hot songs to disk.

    The song ids are read from ``self.singer_url`` and the result is written to
    ``self.singer_all_hot_comments_file_path``. If no song id is found, an
    error is logged and the comments are not fetched or written.
    '''
    target_file = self.singer_all_hot_comments_file_path
    Helper.check_file_exits_and_overwrite(target_file)
    # id list of the singer's hot songs
    ids = Helper.get_singer_hot_songs_ids(self.singer_url)
    if len(ids) == 0:
        self.logger.error(
            "crawl from %s to get %s all hot songs ids failed!" %
            (self.singer_url, self.singer_name))
        return

    collected = []
    for sid in ids:
        comments_url = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_{song_id}/?csrf_token=".format(
            song_id=sid)
        collected.extend(self.get_hot_comments(comments_url))

    # One json string per comment, one comment per output line.
    json_lines = [
        self.extract_comment_info_as_json_str(entry) for entry in collected
    ]
    Helper.save_lines_to_file(json_lines, target_file)
    self.logger.info(
        "Write {singer_name}'s {num} hot songs hot comments successfully!".
        format(singer_name=self.singer_name, num=len(ids)))
def draw_wordcloud(self, cutted_words_text, save_path,
                   background_path=None, font_path=None,
                   max_words=2000, max_font_size=40, background_color='white'):
    '''
    Draw a word cloud and save the image to disk.

    :param cutted_words_text: already segmented text, words separated by spaces
    :param save_path: where the image is written
    :param background_path: mask image path; ``Constants.DEFAULT_BACKGROUND_PATH`` when None
    :param font_path: font file path; ``Constants.DEFAULT_FONT_PATH`` when None
    :param max_words: maximum number of words
    :param max_font_size: maximum font size
    :param background_color: background color
    :return: None
    '''
    Helper.check_file_exits_and_overwrite(save_path)
    # Fall back to the project defaults for anything the caller left out.
    mask_path = Constants.DEFAULT_BACKGROUND_PATH if background_path is None else background_path
    font_file = Constants.DEFAULT_FONT_PATH if font_path is None else font_path
    mask_image = imread(mask_path)
    generator = WordCloud(font_path=font_file, background_color=background_color,
                          mask=mask_image, max_words=max_words,
                          max_font_size=max_font_size)
    # Generate the cloud and write it out in one go.
    generator.generate(cutted_words_text).to_file(save_path)
    self.logger.info("Successfully generate wordcloud img to {save_path}!".format(save_path=save_path))
def _download_list_songs_to_file(self, song_urls, save_path_list, total=None):
    '''
    Download a batch of songs, one url per target path.

    :param song_urls: list of song download urls
    :param save_path_list: list of save paths, same length as ``song_urls``
    :param total: overall number of songs across all worker threads; when
        given, progress is reported through the shared ``self.no_counter``
        (updated under ``self.lock``), otherwise through the local index
    :raises ParamsError: if the two lists differ in length
    :return: None
    '''
    n = len(song_urls)
    if n != len(save_path_list):
        raise ParamsError(
            "len(song_urls) must be equal to len(save_path_list)!")
    for position, (url, path) in enumerate(zip(song_urls, save_path_list), 1):
        Helper.download_network_resource(url, path)
        if total is None:
            self.logger.info("Download %d/%d %s to %s!" %
                             (position, n, url, path))
        else:
            # The context manager releases the lock even if logging raises,
            # so one failing worker can no longer block all the others.
            with self.lock:
                self.no_counter += 1
                self.logger.info("Download %d/%d %s to %s!" %
                                 (self.no_counter, total, url, path))
def save_all_users_info_to_file(self):
    '''
    Save the info of all users under a song to disk (single thread).

    :return: None
    '''
    target_file = self.users_info_file_path
    Helper.check_file_exits_and_overwrite(target_file)
    Helper.save_lines_to_file(self.get_users_info_list(), target_file)
def download_singer_hot_songs_by_name_with_multi_threading(
        self, singer_name, threads=20):
    '''
    Download all hot songs of a singer, looked up by name, using several threads.

    :param singer_name: name of the singer
    :param threads: number of worker threads
    :return: None
    '''
    began_at = time.time()
    # Directory the hot songs are saved into.
    target_dir = os.path.join(Constants.SINGER_SAVE_DIR, singer_name,
                              Constants.HOT_SONGS_SAVE_NAME)
    # Resolve the singer id, then the singer's home page.
    singer_id = self.get_singer_id_by_name(singer_name)
    home_page = "http://music.163.com/artist?id=%d" % singer_id
    song_ids = Helper.get_singer_hot_songs_ids(home_page)
    # Download urls and display names, both derived from the song ids.
    download_urls = self.get_download_urls_by_ids(song_ids)
    display_names = self.get_songs_name_and_singer_name_str_list_by_ids_list(
        song_ids)
    total = len(download_urls)
    Helper.mkdir(target_dir)
    self.logger.info("%s has total %d hot songs!" % (singer_name, total))
    self.logger.info(
        "(multi threads,thread_num = %d)Now start download hot musics of %s(save path is:%s):"
        % (threads, singer_name, target_dir))
    # Shared progress counter starts from zero.
    self.no_counter = 0
    chunk_size = total // threads
    workers = []
    for idx in range(threads):
        lower = idx * chunk_size
        # The last worker takes everything that is left.
        upper = total if idx == threads - 1 else lower + chunk_size
        targets = [
            os.path.join(target_dir, "%s.mp3" % name)
            for name in display_names[lower:upper]
        ]
        workers.append(
            Thread(target=self._download_list_songs_to_file,
                   args=(download_urls[lower:upper], targets, total)))
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    elapsed = time.time() - began_at
    self.logger.info("Download %s's %d hot songs to %s succeed!"
                     "Costs %.2f seconds!" % (singer_name, total, target_dir,
                                              elapsed))
def download_play_list_songs(self, play_list_id, limit=1000):
    '''
    Download every song of a play list, one after another (single thread).

    A song that fails to download is logged and skipped.

    :param play_list_id: id of the play list
    :param limit: maximum number of songs to download
    :return: None
    '''
    began_at = time.time()
    # Play list detail as returned by the server.
    detail = self.get_play_list_detail(play_list_id, limit).json()
    track_ids = [track['id'] for track in detail['playlist']["trackIds"]]
    list_name = detail['playlist']['name']
    # Songs are saved in a directory named after the play list.
    target_dir = os.path.join(Constants.PLAY_LIST_SAVE_DIR, list_name)
    Helper.mkdir(target_dir)
    # "song name + singer name" strings, used as file names.
    display_names = self.get_songs_name_and_singer_name_str_list_by_ids_list(
        track_ids)
    download_urls = self.get_download_urls_by_ids(track_ids)
    total = len(download_urls)
    self.logger.info("play list %s has total %d songs!" % (list_name, total))
    self.logger.info(
        "(single thread)Now start download musics of %s(save path is:%s):" %
        (list_name, target_dir))
    for position, url in enumerate(download_urls, 1):
        display_name = display_names[position - 1]
        try:
            Helper.download_network_resource(
                url, os.path.join(target_dir, "%s.mp3" % display_name))
        except Exception:
            self.logger.info("Fail download %d/%d(%s)!" %
                             (position, total, display_name))
            continue
        self.logger.info("Successfully download %d/%d(%s)!" %
                         (position, total, display_name))
    elapsed = time.time() - began_at
    self.logger.info(
        "It costs %.2f seconds to download play list %s(id=%s)'s %d songs to %s "
        "using single thread!" % (elapsed, list_name, play_list_id, total,
                                  target_dir))
def save_users_info(self, users_url, total_urls_num):
    '''
    Fetch the info of the given users and append it to the users info file.

    Run as the thread target of
    ``save_all_users_info_to_file_by_multi_threading``.

    :param users_url: list of user urls this worker has to process
    :param total_urls_num: number of urls over all workers (used for progress output)
    '''
    users_info_list = self.get_users_info_list(users_url, total_urls_num)
    # Several workers append to the same file, so the write is serialized.
    # The context manager releases the lock even when the write raises;
    # a manual acquire()/release() pair would leave it held forever.
    with self.lock:
        Helper.save_lines_to_file(users_info_list, self.users_info_file_path, "a")
def get_page_comments_format_raw_json(self, url, page):
    '''
    Fetch the comments of one page exactly as the server returns them.

    :param url: request url
    :param page: page number
    :return: the raw json text as ``str`` (bytes are decoded as utf-8)
    '''
    raw = Helper.get_json(url, Helper.get_params(page))
    # Normalize a bytes payload to str; anything else is passed through.
    return raw.decode("utf-8") if isinstance(raw, bytes) else raw
def download_singer_hot_songs_by_name(self, singer_name):
    '''
    Download all hot songs of a singer, looked up by name (single thread).

    A song that fails to download is logged and skipped.

    :param singer_name: name of the singer
    :return: None
    '''
    began_at = time.time()
    # Directory the hot songs are saved into.
    target_dir = os.path.join(Constants.SINGER_SAVE_DIR, singer_name,
                              Constants.HOT_SONGS_SAVE_NAME)
    # Resolve the singer id, then the singer's home page.
    singer_id = self.get_singer_id_by_name(singer_name)
    home_page = "http://music.163.com/artist?id=%d" % singer_id
    song_ids = Helper.get_singer_hot_songs_ids(home_page)
    # Download urls and display names, both derived from the song ids.
    download_urls = self.get_download_urls_by_ids(song_ids)
    display_names = self.get_songs_name_and_singer_name_str_list_by_ids_list(
        song_ids)
    total = len(download_urls)
    Helper.mkdir(target_dir)
    self.logger.info("%s has total %d hot songs!" % (singer_name, total))
    self.logger.info(
        "(single thread)Now start download hot musics of %s(save path is:%s):"
        % (singer_name, target_dir))
    for position, url in enumerate(download_urls, 1):
        display_name = display_names[position - 1]
        try:
            Helper.download_network_resource(
                url, os.path.join(target_dir, "%s.mp3" % display_name))
        except Exception:
            self.logger.info("Fail download %d/%d(%s)!" %
                             (position, total, display_name))
            continue
        self.logger.info("Successfully download %d/%d(%s)!" %
                         (position, total, display_name))
    elapsed = time.time() - began_at
    self.logger.info(
        "It costs %.2f seconds to download singer %s's %d hot songs to %s "
        "using single thread!" % (elapsed, singer_name, total, target_dir))
def pretty_print_self_fans(self, offset=0, limit=30):
    '''
    Pretty print the fans of the logged-in user through the logger.

    :param offset: start position
    :param limit: maximum number of fans returned
    :return: None
    '''
    res = self.netcloud_login.get_self_fans(offset=offset, limit=limit).json()
    fans = res['followeds']
    self.logger.info("My fans list is(count %d):" % len(fans))
    # Print the fans one by one.
    for index, content in enumerate(fans, 1):
        self.logger.info("-" * 20 + " fans %d " % index + "-" * 20)
        self.logger.info("user name:%s" % content["nickname"])
        self.logger.info("user id:%s" % content["userId"])
        self.logger.info("user signature:%s" % content["signature"])
        # Pick the label first: `%` binds tighter than the conditional
        # expression, so formatting and choosing in one expression logged a
        # bare "female" without the "gender:" prefix.
        gender = "male" if content["gender"] == 1 else "female"
        self.logger.info("gender:%s" % gender)
        self.logger.info("avatar url:%s" % content["avatarUrl"])
        self.logger.info("play list count:%s" % content["playlistCount"])
        self.logger.info("event count:%s" % content["eventCount"])
        self.logger.info("fans count:%s" % content["followeds"])
        self.logger.info("follows count:%s" % content["follows"])
        # Date (year-month-day) the fan followed the user; the stamp is
        # scaled by 0.001 before conversion.
        self.logger.info("follow time:%s" %
                         Helper.from_timestamp_to_date(content["time"] * 0.001, "%Y-%m-%d"))
def __init__(self, *args, **kwargs):
    '''
    Create the logger and the ``NetCloudLogin`` object used by the printer.

    :param args: positional arguments forwarded to ``NetCloudLogin``
    :param kwargs: keyword arguments forwarded to ``NetCloudLogin``
    '''
    self.logger = Helper.get_logger()
    # Unpack the arguments when forwarding them. Passing `args, kwargs`
    # handed NetCloudLogin a tuple and a dict as its first two positional
    # parameters. With nothing supplied this is a plain NetCloudLogin().
    self.netcloud_login = NetCloudLogin(*args, **kwargs)
def __init__(self):
    '''
    Set up the logger, the fixed singer/song under test and a crawler for them.
    '''
    self.logger = Helper.get_logger()
    self.singer_name = "刘瑞琪"
    self.song_name = "离开的借口"
    crawler = Crawler.NetCloudCrawler(self.song_name, self.singer_name)
    self.crawler = crawler
    # Home page of the singer, built from the id the crawler exposes.
    self.singer_url = 'http://music.163.com/artist?id={singer_id}'.format(
        singer_id=crawler.singer_id)
def draw_all_comments_wordcloud(self):
    '''
    Create the word cloud image of all comments of the song, using default settings.

    :return: None
    '''
    comments_file = self.comments_file_path
    # Crawl and save the comments first when they are not on disk yet.
    if not os.path.exists(comments_file):
        self.save_all_comments_to_file()
    comments = Helper.load_file_format_json(self.comments_file_path)
    if len(comments) == 0:
        self.logger.error("Load %s failed!" % self.comments_file_path)
        return
    merged_text = "".join(
        [entry[Constants.COMMENT_CONTENT_KEY] for entry in comments])
    stopwords = Helper.load_stopwords()
    # Keep every segmented word that is not a stop word.
    kept_words = [
        word for word in Helper.cut_text(merged_text) if word not in stopwords
    ]
    image_path = os.path.join(self.song_path,
                              "%s_all_comments.png" % self.song_name)
    self.draw_wordcloud(" ".join(kept_words), image_path)
def save_sorted_bar_plot(self, datas, label, title, key_index,
                         save_path, reverse=False):
    '''
    Draw a sorted bar chart of value frequencies and save it.

    :param datas: input data; occurrences are counted with ``Counter``
    :param label: series label
    :param title: chart title
    :param key_index: index the (value, count) pairs are sorted by
    :param reverse: sort descending when True (ascending by default)
    :param save_path: where the chart is rendered to
    :return: None
    '''
    Helper.check_file_exits_and_overwrite(save_path)
    pairs = list(Counter(datas).items())
    pairs.sort(key=itemgetter(key_index), reverse=reverse)
    # Split the sorted (value, count) pairs into the two chart axes.
    categories, counts = zip(*pairs)
    chart = Bar(title)
    chart.add(label, categories, counts)
    chart.render(save_path)
def send(self):
    '''
    Send the request (core request function) and record the outcome on ``self.response``.

    The search method is posted with ``self.data`` as-is; every other method
    goes through the web-api session. Any exception raised while sending is
    printed, logged and stored on ``self.response.error`` instead of being
    propagated.

    :raises ParamsError: if ``self.method`` is None
    :return: None
    '''
    # The request method must not be empty.
    if self.method is None:
        raise ParamsError()
    try:
        if self.method == Constants.SEARCH_REQUEST_METHOD:
            # Search: build the session and the url, then post the raw data.
            req = self._get_requests()
            _url = Constants.MUSIC163_BASE_URL + Constants.REQUEST_METHODS[
                self.method]
            resp = req.post(_url, data=self.data)
            self._build_response(resp)
            self.response.ok = True
        else:
            # NOTE(review): `data` is only bound when self.data is a dict; a
            # non-dict payload on a POST method ends in the except branch
            # below - confirm that this is intended.
            if isinstance(self.data, dict):
                # Encrypt the request data.
                data = Helper.encrypted_request(self.data)
            # Use the web-api style request.
            req = self._get_webapi_requests()
            _url = Constants.MUSIC163_BASE_URL + Constants.REQUEST_METHODS[
                self.method]
            # User dj, user follows, user events: fill in the user id.
            if self.method in (Constants.USER_DJ_REQUEST_METHOD,
                               Constants.USER_FOLLOWS_REQUEST_METHOD,
                               Constants.USER_EVENT_REQUEST_METHOD):
                _url = _url % self.params['uid']
            # Lyrics and comments: fill in the resource id.
            if self.method in (Constants.LYRIC_REQUEST_METHOD,
                               Constants.MUSIC_COMMENT_REQUEST_METHOD,
                               Constants.ALBUM_COMMENT_REQUEST_METHOD):
                _url = _url % self.params['id']
            if self.method == Constants.LYRIC_REQUEST_METHOD:
                # Fetching lyrics needs no extra post data.
                resp = req.get(_url)
            else:
                # All other requests carry the encrypted data.
                resp = req.post(_url, data=data)
            self._build_response(resp)
            self.response.ok = True
    except Exception as why:
        # Print the stack trace.
        traceback.print_exc()
        # The exception is a %-style argument, so the message needs a
        # placeholder; without one the logging module fails to format the
        # record and the message is lost.
        self.logger.info('Requests Exception: %s', why)
        # Keep the error on the response.
        self.response.error = why
def save_all_comments_to_file(self):
    '''
    Crawl all comments sequentially and save them to disk.

    :return: None
    '''
    target_file = self.comments_file_path
    Helper.check_file_exits_and_overwrite(target_file)
    began_at = time.time()
    # Each comment dict becomes one json string, i.e. one output line.
    json_lines = [
        self.extract_comment_info_as_json_str(entry)
        for entry in self.get_all_comments()
    ]
    Helper.save_lines_to_file(json_lines, self.comments_file_path)
    elapsed = time.time() - began_at
    print("It costs %.2f seconds to crawler <%s>." % (elapsed, self.song_name))
def save_lyrics_to_file(self):
    '''
    Save the lyrics of the song to disk.

    The file starts with the song name and the singer name, followed by the
    lyric lines with their ``[mm:ss.xx]`` time tags stripped.

    :return: None
    '''
    file_name = "{song_name}_lyrics.txt".format(song_name=self.song_name)
    save_path = os.path.join(self.song_path, file_name)
    Helper.check_file_exits_and_overwrite(save_path)
    raw_lyrics = json.loads(self.get_lyrics_format_json())['lrc']['lyric']
    # Capture only the text after each time tag, newline included.
    tagged_line = re.compile(r'\[\d+:\d+\.\d+\](.+?\n)')
    lines = tagged_line.findall(raw_lyrics)
    header = "{song_name}\n{singer_name}\n".format(
        song_name=self.song_name, singer_name=self.singer_name)
    with open(save_path, "w", encoding="utf-8") as out:
        out.writelines([header] + lines)
    self.logger.info(
        "save {save_path} successfully!".format(save_path=save_path))
def save_pages_comments(self, begin_page, end_page, total_comments_num):
    '''
    Save the comments from begin_page to end_page (called by multi threading).

    :param begin_page: first page index (0-based, inclusive)
    :param end_page: last page index (exclusive)
    :param total_comments_num: number of all comments (used for progress output)
    '''
    # All comments of this worker, each one as a json string.
    comments_info_list = []
    for i in range(begin_page, end_page):
        page_no = i + 1
        json_dict = self.get_page_comments_format_dict(self.comments_url,
                                                       page_no)
        try:
            for item in json_dict[Constants.COMMENTS_KEY]:
                json_str = self.extract_comment_info_as_json_str(item)
                # The counter is shared between workers. The context manager
                # releases the lock even if logging raises, which a manual
                # acquire()/release() pair does not.
                with self.lock:
                    self.no_counter += 1
                    self.logger.info("get %d/%d music comment succeed!" %
                                     (self.no_counter, total_comments_num))
                comments_info_list.append(json_str)
        except KeyError as key_error:
            self.logger.error("Fail to get page {page}.".format(page=page_no))
            self.logger.error(
                "Server parse error:{error}".format(error=key_error))
        except Exception as e:
            self.logger.error("Fail to get page {page}.".format(page=page_no))
            self.logger.error(e)
        else:
            self.logger.info(
                "Successfully to save page {page}.".format(page=page_no))
    # Append to the shared file under the lock; released even if the write fails.
    with self.lock:
        Helper.save_lines_to_file(comments_info_list,
                                  self.comments_file_path, "a")
    self.logger.info(
        "Write page {begin_page} to {end_page} successfully!".format(
            begin_page=begin_page, end_page=end_page))
def save_all_comments_to_file_by_multi_threading(self, threads=10):
    '''
    Save all comments of the song to disk using several threads.

    The comment pages are cut into ``threads`` consecutive ranges; each range
    is handled by ``self.save_pages_comments`` in its own thread.

    :param threads: number of worker threads
    '''
    self.no_counter_init()
    # Deal with an already existing comments file first.
    Helper.check_file_exits_and_overwrite(self.comments_file_path)
    began_at = time.time()
    total_comments_num, page = self.get_song_total_comments_num_and_page_num()
    self.logger.info(
        "Song name:{song_name}".format(song_name=self.song_name))
    self.logger.info("There are %d pages of total %d comments!" %
                     (page, total_comments_num))
    pages_per_worker = page // threads
    workers = []
    for idx in range(threads):
        first_page = idx * pages_per_worker
        # The last worker takes every page that is left.
        stop_page = page if idx == threads - 1 else first_page + pages_per_worker
        workers.append(
            Thread(target=self.save_pages_comments,
                   args=(first_page, stop_page, total_comments_num)))
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    elapsed = time.time() - began_at
    self.logger.info(
        "Using {threads} threads,it costs {cost_time} seconds to crawl <{song_name}>'s all comments!"
        .format(threads=threads,
                cost_time=elapsed,
                song_name=self.song_name))
def pretty_print_self_info(self):
    '''
    Log in and pretty print the personal info of the account through the logger.

    :return: None
    '''
    info_dict = self.netcloud_login.login().json()
    profile = info_dict['profile']
    details = {}
    details['avatarUrl'] = profile['avatarUrl']          # avatar url
    details['signature'] = profile['signature']          # signature
    details['nickname'] = profile['nickname']            # nick name
    details['userName'] = info_dict['account']['userName']  # user name
    details['province_id'] = profile['province']         # province
    birthday_no = profile['birthday']                    # birthday time stamp
    if birthday_no < 0:
        details['birthday'] = "unknown"
    else:
        details['birthday'] = Helper.from_timestamp_to_date(
            time_stamp=birthday_no * 0.001, format="%Y-%m-%d")
    details['description'] = profile['description']
    gender_code = profile['gender']
    details['gender'] = ('male' if gender_code == 1
                         else 'female' if gender_code == 0
                         else 'unknown')
    details['userId'] = profile['userId']
    bindings = info_dict['bindings']
    # Phone and e-mail are stored as json strings inside the bindings.
    details['cellphone'] = json.loads(bindings[0]['tokenJsonStr'])['cellphone']
    details['email'] = json.loads(bindings[1]['tokenJsonStr'])['email']

    self.logger.info("Hello,{nickname}!\nHere is your personal info:".format(
        nickname=details['nickname']))
    report = ("avatarUrl:{avatarUrl}\nsignature:{signature}\n"
              "nickname:{nickname}\n"
              "userName:{userName}\nprovince_id:{province_id}\n"
              "birthday:{birthday}\ndescription:{description}\n"
              "gender:{gender}\nuserId:{userId}\n"
              "cellphone:{cellphone}\nemail:{email}\n")
    self.logger.info(report.format(**details))
def pretty_print_search_song(self, search_song_name, offset=0, limit=30):
    '''
    Pretty print the result of searching for a song through the logger.

    :param search_song_name: name of the song to search for
    :param offset: start position
    :param limit: maximum number of results returned
    :return: None
    '''
    log = self.logger.info
    # Call the search api (type 1 = songs).
    res = self.netcloud_login.search(keyword=search_song_name, type_=1,
                                     offset=offset, limit=limit).json()
    songs = res['result']['songs']
    log("Your search song name is:%s" % search_song_name)
    log("Here is your search result(total %d):" % len(songs))
    rule = "-" * 20
    for index, content in enumerate(songs, 1):
        log(rule + " search result %d " % index + rule)
        log("song name:%s" % content['name'])
        log("alias:%s" % content['alias'])
        # A song may have several singers; one line per singer.
        log("singer:")
        for artist in content['artists']:
            log(artist['name'])
        album = content['album']
        log("\nalbum:%s" % album['name'])
        # Album publish date (year-month-day).
        publish_date = Helper.from_timestamp_to_date(
            album['publishTime'] * 0.001, format="%Y-%m-%d")
        log("album publish time:%s" % publish_date)
        # Duration is given in milliseconds.
        minutes = content['duration'] // 60000
        seconds = content['duration'] // 1000 % 60
        log("song duration:%s m,%s s." % (minutes, seconds))
        log("song id:%s" % content["id"])
        log("singer id:")
        for artist in content["artists"]:
            log(artist['id'])
        log("\nalbum id:%s" % content['album']['id'])
        log("mv id:%s" % content["mvid"])
def load_all_users_url(self):
    '''
    Build the home page urls of all users found in the saved comments file.

    The comments are crawled (multi threaded) first when the file does not
    exist yet.

    :return: list of distinct user home page urls (in no particular order)
    '''
    if not os.path.exists(self.comments_file_path):
        self.save_all_comments_to_file_by_multi_threading()
    comments = Helper.load_file_format_json(self.comments_file_path)
    # A user id has to consist of digits only.
    digits_only = re.compile(r'^\d+$')
    urls = [
        'http://music.163.com/user/home?id={user_id}'.format(user_id=uid)
        for uid in [entry[Constants.USER_ID_KEY] for entry in comments]
        if digits_only.match(str(uid))
    ]
    # Drop duplicates.
    return list(set(urls))
def pretty_print_search_user(self, keyword, offset=0, limit=30):
    '''
    Pretty print the result of searching for users through the logger.

    :param keyword: search keyword
    :param offset: start position
    :param limit: maximum number of results returned
    :return: None
    '''
    # Search type 1002 = users.
    res = self.netcloud_login.search(keyword, type_=1002, offset=offset,
                                     limit=limit).json()
    profiles = res['result']["userprofiles"]
    self.logger.info("Your search user keyword is:%s" % keyword)
    self.logger.info("Here is your search result(%d count):" % len(profiles))
    for index, content in enumerate(profiles, 1):
        self.logger.info("-" * 20 + " search result %d " % index + "-" * 20)
        self.logger.info("user name:%s" % content['nickname'])
        self.logger.info("user signature:%s" % content["signature"])
        self.logger.info("user description:%s" % content["description"])
        self.logger.info("user detail description:%s" % content["detailDescription"])
        self.logger.info("user id:%s" % content["userId"])
        self.logger.info("province id:%s" % content["province"])
        self.logger.info("city id:%s" % content["city"])
        # Pick the label first: `%` binds tighter than the conditional
        # expression, so formatting and choosing in one expression logged a
        # bare "female" without the "gender:" prefix.
        gender = "male" if content["gender"] == 1 else "female"
        self.logger.info("gender:%s" % gender)
        # Birthday as year-month-day; the stamp is scaled by 0.001 before conversion.
        self.logger.info("birthday:%s" %
                         Helper.from_timestamp_to_date(content["birthday"] * 0.001, "%Y-%m-%d"))
        self.logger.info("avatar url:%s" % content["avatarUrl"])
        self.logger.info("background image url:%s" % content["backgroundUrl"])
def test_get_singer_hot_songs_ids(self):
    '''
    Fetch the hot song ids of the configured singer and log them.
    '''
    hot_songs_ids = Helper.get_singer_hot_songs_ids(self.singer_url)
    self.logger.info(hot_songs_ids)
def get_users_info_list(self, users_url=None, total_urls_num=None):
    '''
    Scrape the profile page of every user and return the extracted info.

    A user whose page cannot be processed is logged and left out of the result.

    :param users_url: list of user home page urls; all users of the song are
        loaded via ``self.load_all_users_url()`` when None
    :param total_urls_num: number of urls over all workers; None by default,
        a truthy value means the call comes from a worker thread and progress
        is reported through the shared, lock-protected ``self.no_counter``
    :return: list of json strings, one per successfully processed user
    '''
    if users_url is None:
        # All user urls found under the song.
        users_url = self.load_all_users_url()
    num = len(users_url)

    # Compiled once up front instead of once per user.
    user_id_pattern = re.compile(r'.*id=(\d+)')
    event_count_pattern = re.compile(r'<strong id="event_count">(\d+?)</strong>')
    follow_count_pattern = re.compile(r'<strong id="follow_count">(\d+?)</strong>')
    fan_count_pattern = re.compile(r'<strong id="fan_count">(\d+?)</strong>')
    location_pattern = re.compile('<span>所在地区:(.+?)</span>')
    description_pattern = re.compile('<div class="inf s-fc3 f-brk">个人介绍:(.*?)</div>')
    age_pattern = re.compile(r'<span.*?data-age="(\d+)">')
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    listening_songs_num_pattern = re.compile(r'<h4>累积听歌(\d+?)首</h4>')

    def first_group(pattern, text):
        # First capture group of the pattern, or the unknown marker on no match.
        found = pattern.search(text)
        return found.group(1) if found else Constants.UNKNOWN_TOKEN

    users_info_list = []
    for index, user_url in enumerate(users_url, 1):
        try:
            user_id = user_id_pattern.search(user_url).group(1)
            # Time of the crawl.
            crawler_time = Helper.from_timestamp_to_date(time_stamp=time.time())
            html = requests.get(user_url, headers=Constants.REQUEST_HEADERS).text

            event_count = first_group(event_count_pattern, html)
            follow_count = first_group(follow_count_pattern, html)
            fan_count = first_group(fan_count_pattern, html)
            location = first_group(location_pattern, html)
            description = first_group(description_pattern, html)

            age_match = age_pattern.search(html)
            if age_match:
                # The page carries a millisecond time stamp; turn it into an
                # age in years relative to the current year.
                birth_stamp = int(age_match.group(1))
                current_year = int(Helper.from_timestamp_to_date(
                    time_stamp=time.time(), format="%Y"))
                age = (current_year - 1970) - birth_stamp // (1000 * 365 * 24 * 3600)
            else:
                age = Constants.UNKNOWN_TOKEN

            listening_songs_num = first_group(listening_songs_num_pattern, html)

            user_info_dict = {
                Constants.USER_ID_KEY: user_id,
                Constants.CRAWLER_TIME_KEY: crawler_time,
                Constants.EVENT_COUNT_KEY: event_count,
                Constants.FOLLOW_COUNT_KEY: follow_count,
                Constants.FAN_COUNT_KEY: fan_count,
                Constants.LOCATION_KEY: location,
                Constants.USER_DESCRIPTION_KEY: description,
                Constants.USER_AGE_KEY: age,
                Constants.LISTENING_SONGS_NUM_KEY: listening_songs_num
            }
            users_info_list.append(json.dumps(user_info_dict, ensure_ascii=False))

            if total_urls_num:
                # Called from a worker thread: the counter is shared. The
                # context manager releases the lock even if logging raises.
                with self.lock:
                    self.no_counter += 1
                    self.logger.info(
                        "Write {current}/{total} user info to file successfully!".format(
                            current=self.no_counter, total=total_urls_num))
            else:
                # Plain single threaded call.
                self.logger.info(
                    "Write {current}/{total} user info to file successfully!".format(
                        current=index, total=num))
        except Exception as e:
            self.logger.error("Fail to get No.{index} comment user's info:{error}"
                              .format(index=index, error=e))
    return users_info_list
def core_visual_analyse(self):
    '''
    Visualize the comments and the user info (core function, drawn with pyecharts).

    Charts are rendered into the plots directory of the song, in this order:
    1. comment time distribution, per month and per day (bar)
    2. liked count distribution (bar)
    3. comment keyword distribution, stop words removed (bar)
    4. user location distribution (geo map)
    5. user location distribution (bar)
    6. user event count distribution (bar)
    7. user follow count distribution (bar)
    8. user fan count distribution (bar)
    9. user description keyword distribution, stop words removed (bar)
    10. user age distribution (bar)
    11. listened songs count distribution, bucketed (bar)
    '''
    plots_dir = os.path.join(self.song_path, Constants.PLOTS_SAVE_NAME)
    Helper.mkdir(plots_dir)
    comments = Helper.load_file_format_json(self.comments_file_path)
    users = Helper.load_file_format_json(self.users_info_file_path)

    def chart_file(file_name):
        # Path of one chart inside the plots directory.
        return os.path.join(plots_dir, file_name)

    def known_ints(records, key):
        # Integer values of a field, skipping records marked as unknown.
        return [int(record[key]) for record in records
                if record[key] != Constants.UNKNOWN_TOKEN]

    # 1. comment time distribution, per month and per day
    per_month = []
    per_day = []
    for stamp in [entry[Constants.CREATE_TIME_STAMP_KEY] for entry in comments]:
        # The stored stamp is scaled by 0.001 before conversion.
        per_month.append(Helper.from_timestamp_to_date(stamp * 0.001, format="%Y-%m"))
        per_day.append(Helper.from_timestamp_to_date(stamp * 0.001, format="%Y-%m-%d"))
    self.save_sorted_bar_plot(
        datas=per_month,
        label="年-月",
        title="歌曲<{song_name}>评论时间(年-月)数量分布".format(song_name=self.song_name),
        key_index=0,
        save_path=chart_file(Constants.ECHARTS_COMMENTS_YEAR_MONTH_BAR_HTML)
    )
    self.save_sorted_bar_plot(
        datas=per_day,
        label="年-月-日",
        title="歌曲<{song_name}>评论时间(年-月-日)数量分布".format(song_name=self.song_name),
        key_index=0,
        save_path=chart_file(Constants.ECHARTS_COMMENTS_YEAR_MONTH_DAY_BAR_HTML)
    )

    # 2. liked count distribution
    self.save_sorted_bar_plot(
        datas=known_ints(comments, Constants.LIKED_COUNT_KEY),
        label="点赞数量",
        title="歌曲<{song_name}>评论点赞数量分布".format(song_name=self.song_name),
        key_index=0,
        save_path=chart_file(Constants.ECHARTS_LIKED_COUNT_BAR_HTML)
    )

    # 3. comment keywords, without stop words and words shorter than 2
    segmented_comments = Helper.cut_text(
        "".join([entry[Constants.COMMENT_CONTENT_KEY] for entry in comments]))
    stopwords = Helper.load_stopwords()
    comment_keywords = [word for word in segmented_comments
                        if word not in stopwords and len(word) > 1]
    self.save_sorted_bar_plot(
        datas=comment_keywords,
        label="关键词",
        title="歌曲<{song_name}>评论关键词数量分布(已去除停用词)".format(song_name=self.song_name),
        key_index=1,
        save_path=chart_file(Constants.ECHARTS_COMMENTS_KEYWORDS_BAR_HTML),
        reverse=True
    )

    # 4. user locations on a map
    locations = [user[Constants.LOCATION_KEY] for user in users]
    supported_cities = Helper.load_echarts_support_cities()
    matched_cities = []
    for place in locations:
        # First supported city mentioned in the location, if any.
        hit = next((city for city in supported_cities if city in place), None)
        if hit is not None:
            matched_cities.append(hit)
    city_counts = list(Counter(matched_cities).items())
    geo = Geo("歌曲<{song_name}>评论用户所在地区分布".format(song_name=self.song_name),
              title_color="#fff", title_pos="left", width=1200, height=600,
              background_color='#404a59')
    attr, value = geo.cast(city_counts)
    geo.add("", attr, value, visual_range=[0, 200], visual_text_color="#fff",
            symbol_size=15, is_visualmap=True)
    geo_file = chart_file(Constants.ECHARTS_USERS_CITY_GEO_HTML)
    Helper.check_file_exits_and_overwrite(geo_file)
    geo.render(geo_file)

    # 5. user locations as a bar chart
    self.save_sorted_bar_plot(
        datas=locations,
        label="用户所在地区",
        title="歌曲<{song_name}>评论用户所在地区分布".format(song_name=self.song_name),
        key_index=1,
        save_path=chart_file(Constants.ECHARTS_USERS_LOCATION_BAR_HTML),
        reverse=True
    )

    # 6. user event counts
    self.save_sorted_bar_plot(
        datas=known_ints(users, Constants.EVENT_COUNT_KEY),
        label="用户动态总数",
        title="歌曲<{song_name}>评论用户动态总数分布".format(song_name=self.song_name),
        key_index=0,
        save_path=chart_file(Constants.ECHARTS_EVENTS_COUNT_BAR_HTML)
    )

    # 7. user follow counts
    self.save_sorted_bar_plot(
        datas=known_ints(users, Constants.FOLLOW_COUNT_KEY),
        label="用户关注人数",
        title="歌曲<{song_name}>评论用户关注人数分布".format(song_name=self.song_name),
        key_index=0,
        save_path=chart_file(Constants.ECHARTS_FOLLOW_COUNT_BAR_HTML)
    )

    # 8. user fan counts
    self.save_sorted_bar_plot(
        datas=known_ints(users, Constants.FAN_COUNT_KEY),
        label="用户粉丝人数",
        title="歌曲<{song_name}>评论用户粉丝人数分布".format(song_name=self.song_name),
        key_index=0,
        save_path=chart_file(Constants.ECHARTS_FAN_COUNT_BAR_HTML)
    )

    # 9. user description keywords, without stop words and words shorter than 2
    segmented_descriptions = Helper.cut_text(
        "".join([user[Constants.USER_DESCRIPTION_KEY] for user in users]))
    description_keywords = [word for word in segmented_descriptions
                            if word not in stopwords and len(word) > 1]
    self.save_sorted_bar_plot(
        datas=description_keywords,
        label="用户简介关键词",
        title="歌曲<{song_name}>评论用户简介关键词数量分布(已去除停用词)".format(song_name=self.song_name),
        key_index=1,
        save_path=chart_file(Constants.ECHARTS_USER_DESCRIPTION_KEYWORDS_BAR_HTML),
        reverse=True
    )

    # 10. user ages; negative ages are discarded
    ages = [age for age in known_ints(users, Constants.USER_AGE_KEY) if age >= 0]
    self.save_sorted_bar_plot(
        datas=ages,
        label="年龄",
        title="歌曲<{song_name}>评论用户年龄分布".format(song_name=self.song_name),
        key_index=0,
        save_path=chart_file(Constants.ECHARTS_USER_AGE_BAR_HTML)
    )

    # 11. listened songs counts, bucketed because the value range is very wide
    bucket_counts = {'0-100': 0, '100-1000': 0, '1000-10000': 0, '>10000': 0}
    upper_bounds = (('0-100', 100), ('100-1000', 1000), ('1000-10000', 10000))
    for listened in known_ints(users, Constants.LISTENING_SONGS_NUM_KEY):
        for bucket, bound in upper_bounds:
            if listened < bound:
                bucket_counts[bucket] += 1
                break
        else:
            bucket_counts['>10000'] += 1
    self.save_sorted_bar_plot(
        datas=bucket_counts,
        label="听歌总数",
        title="歌曲<{song_name}>评论用户听歌总数分布".format(song_name=self.song_name),
        key_index=1,
        save_path=chart_file(Constants.ECHARTS_LISTENING_SONGS_NUM_BAR_HTML),
        reverse=True
    )
def __init__(self):
    '''
    Set up the logger, a printer and a login object, all created without arguments.
    '''
    self.logger = Helper.get_logger()
    # Both objects are created without any login arguments.
    printer = NetCloudPrinter()
    self.login_printer = printer
    login = NetCloudLogin()
    self.netcloud_login = login
def test_get_download_urls_by_ids(self):
    '''
    Resolve the download urls of a fixed singer's hot songs and log them.
    '''
    ids_list = Helper.get_singer_hot_songs_ids(
        "http://music.163.com/artist?id=9621")
    download_urls = self.netcloud_login.get_download_urls_by_ids(ids_list)
    self.logger.info(download_urls)
def test_get_songs_name_list_by_ids_list(self):
    '''
    Resolve the song names of a fixed singer's hot songs and log them.
    '''
    ids_list = Helper.get_singer_hot_songs_ids(
        "http://music.163.com/artist?id=7214")
    songs_names = self.netcloud_login.get_songs_name_list_by_ids_list(ids_list)
    self.logger.info(songs_names)