def pretty_print_self_fans(self, offset=0, limit=30):
    '''
    Log the fans of the logged-in user, one block of lines per fan.

    One page of fans is fetched through ``self.netcloud_login.get_self_fans``
    and every field is written to ``self.logger`` with ``info``.

    :param offset: start position forwarded to ``get_self_fans``
    :param limit: maximum number of fans forwarded to ``get_self_fans``
    :return: None
    '''
    res = self.netcloud_login.get_self_fans(offset=offset, limit=limit).json()
    fans = res['followeds']
    # Number of fans contained in this response
    self.logger.info("My fans list is(count %d):" % len(fans))
    # Print the fans one by one
    for index, content in enumerate(fans, 1):
        self.logger.info("-" * 20 + " fans %d " % index + "-" * 20)
        # User name
        self.logger.info("user name:%s" % content["nickname"])
        # User id
        self.logger.info("user id:%s" % content["userId"])
        # User signature
        self.logger.info("user signature:%s" % content["signature"])
        # Gender. The parentheses are required: ``%`` binds tighter than the
        # conditional expression, so without them the else-branch would log
        # the bare word "female" without the "gender:" prefix.
        self.logger.info("gender:%s" % ("male" if content["gender"] == 1 else "female"))
        # Avatar url
        self.logger.info("avatar url:%s" % content["avatarUrl"])
        # Number of play lists
        self.logger.info("play list count:%s" % content["playlistCount"])
        # Number of events
        self.logger.info("event count:%s" % content["eventCount"])
        # Number of fans this fan has
        self.logger.info("fans count:%s" % content["followeds"])
        # Number of users this fan follows
        self.logger.info("follows count:%s" % content["follows"])
        # Date on which this fan followed the current user (year-month-day);
        # the raw value is scaled by 0.001 before conversion (presumably ms -> s)
        self.logger.info("follow time:%s" % Helper.from_timestamp_to_date(content["time"] * 0.001, "%Y-%m-%d"))
def pretty_print_self_info(self):
    '''
    Log the profile of the logged-in user.

    The data comes from ``self.netcloud_login.login().json()``; the
    ``profile``, ``account`` and ``bindings`` sections are read and two
    messages are written to ``self.logger`` with ``info``.

    A negative birthday is reported as "unknown". The cell phone number and
    the e-mail address are read from the first and second entry of
    ``bindings``; when such an entry is absent or does not carry the expected
    field, "unknown" is reported instead of raising.

    :return: None
    '''
    info_dict = self.netcloud_login.login().json()
    profile = info_dict['profile']
    avatarUrl = profile['avatarUrl']  # avatar url
    signature = profile['signature']  # personal signature
    nickname = profile['nickname']  # nick name
    userName = info_dict['account']['userName']  # user name
    province_id = profile['province']  # province code
    birthday_no = profile['birthday']  # birthday as a raw number
    if birthday_no < 0:
        birthday = "unknown"
    else:
        # Scaled by 0.001 before conversion (presumably ms -> s)
        birthday = Helper.from_timestamp_to_date(time_stamp=birthday_no * 0.001, format="%Y-%m-%d")
    description = profile['description']
    if profile['gender'] == 1:
        gender = 'male'
    elif profile['gender'] == 0:
        gender = 'female'
    else:
        gender = 'unknown'
    userId = profile['userId']

    bindings = info_dict.get('bindings') or []

    def binding_field(position, field):
        # Not every account has both bindings, so a missing entry, a missing
        # key or an unparsable token must not abort the whole print-out.
        try:
            return json.loads(bindings[position]['tokenJsonStr'])[field]
        except (IndexError, KeyError, TypeError, ValueError):
            return "unknown"

    cellphone = binding_field(0, 'cellphone')  # cell phone number
    email = binding_field(1, 'email')  # e-mail address

    self.logger.info("Hello,{nickname}!\nHere is your personal info:".format(nickname=nickname))
    self.logger.info("avatarUrl:{avatarUrl}\nsignature:{signature}\n"
                     "nickname:{nickname}\n"
                     "userName:{userName}\nprovince_id:{province_id}\n"
                     "birthday:{birthday}\ndescription:{description}\n"
                     "gender:{gender}\nuserId:{userId}\n"
                     "cellphone:{cellphone}\nemail:{email}\n".format(
                         avatarUrl=avatarUrl,
                         signature=signature,
                         nickname=nickname,
                         userName=userName,
                         province_id=province_id,
                         birthday=birthday,
                         description=description,
                         gender=gender,
                         userId=userId,
                         cellphone=cellphone,
                         email=email
                     ))
def pretty_print_search_song(self, search_song_name, offset=0, limit=30):
    '''
    Log the result of a song search, one block of lines per song.

    :param search_song_name: name of the song to search for
    :param offset: start position forwarded to the search call
    :param limit: maximum number of results forwarded to the search call
    :return: None
    '''
    # Call the search interface (type_=1 is what this method passes for songs)
    response = self.netcloud_login.search(keyword=search_song_name, type_=1, offset=offset, limit=limit)
    songs = response.json()['result']['songs']
    emit = self.logger.info
    emit("Your search song name is:%s" % search_song_name)
    # Number of songs contained in this response
    emit("Here is your search result(total %d):" % len(songs))
    for position, song in enumerate(songs, 1):
        emit("-" * 20 + " search result %d " % position + "-" * 20)
        # Song name and alias
        emit("song name:%s" % song['name'])
        emit("alias:%s" % song['alias'])
        # Singer names (a song may have several singers)
        emit("singer:")
        for singer in song['artists']:
            emit(singer['name'])
        # Album name
        album = song['album']
        emit("\nalbum:%s" % album['name'])
        # Album publish date (year-month-day); raw value is scaled by 0.001 first
        publish_date = Helper.from_timestamp_to_date(album['publishTime'] * 0.001, format="%Y-%m-%d")
        emit("album publish time:%s" % publish_date)
        # Song duration split into minutes and remaining seconds
        duration = song['duration']
        minutes = duration // 60000
        seconds = duration // 1000 % 60
        emit("song duration:%s m,%s s." % (minutes, seconds))
        # Song id
        emit("song id:%s" % song["id"])
        # Singer ids (a song may have several singers)
        emit("singer id:")
        for singer in song["artists"]:
            emit(singer['id'])
        # Album id
        emit("\nalbum id:%s" % song['album']['id'])
        # MV id
        emit("mv id:%s" % song["mvid"])
def pretty_print_search_user(self, keyword, offset=0, limit=30):
    '''
    Log the result of a user search, one block of lines per user.

    :param keyword: search keyword
    :param offset: start position forwarded to the search call
    :param limit: maximum number of results forwarded to the search call
    :return: None
    '''
    # type_=1002 is what this method passes to search for users
    res = self.netcloud_login.search(keyword, type_=1002, offset=offset, limit=limit).json()
    profiles = res['result']['userprofiles']
    # Echo the keyword and the number of users contained in this response
    self.logger.info("Your search user keyword is:%s" % keyword)
    self.logger.info("Here is your search result(%d count):" % len(profiles))
    for index, content in enumerate(profiles, 1):
        self.logger.info("-" * 20 + " search result %d " % index + "-" * 20)
        # User name
        self.logger.info("user name:%s" % content['nickname'])
        # User signature
        self.logger.info("user signature:%s" % content["signature"])
        # User description
        self.logger.info("user description:%s" % content["description"])
        # Detailed user description
        self.logger.info("user detail description:%s" % content["detailDescription"])
        # User id
        self.logger.info("user id:%s" % content["userId"])
        # Province code
        self.logger.info("province id:%s" % content["province"])
        # City code
        self.logger.info("city id:%s" % content["city"])
        # Gender. The parentheses are required: ``%`` binds tighter than the
        # conditional expression, so without them the else-branch would log
        # the bare word "female" without the "gender:" prefix.
        self.logger.info("gender:%s" % ("male" if content["gender"] == 1 else "female"))
        # Birthday. Negative values are reported as "unknown", matching how
        # the other pretty_print_* methods in this file treat birthdays.
        birthday_no = content["birthday"]
        if birthday_no < 0:
            birthday = "unknown"
        else:
            # Scaled by 0.001 before conversion (presumably ms -> s)
            birthday = Helper.from_timestamp_to_date(birthday_no * 0.001, "%Y-%m-%d")
        self.logger.info("birthday:%s" % birthday)
        # Avatar url
        self.logger.info("avatar url:%s" % content["avatarUrl"])
        # Background image url
        self.logger.info("background image url:%s" % content["backgroundUrl"])
def get_users_info_list(self, users_url=None, total_urls_num=None):
    '''
    Scrape the home page of every given user and collect the extracted fields.

    :param users_url: list of user page urls; when None the list returned by
        ``self.load_all_users_url()`` is used
    :param total_urls_num: total number of urls across all workers. When it is
        truthy, progress is counted in ``self.no_counter`` under ``self.lock``
        (the multi-threaded call path); otherwise the loop index is logged.
    :return: list of JSON strings (``json.dumps`` of one dict per user); urls
        that fail are logged with ``self.logger.error`` and skipped
    '''
    users_info_list = []
    if users_url is None:
        # Load the urls of all users that commented on the song
        users_url = self.load_all_users_url()
    num = len(users_url)

    # Patterns are compiled once per call instead of once per url. Raw strings
    # keep ``\d`` from being treated as an (invalid) string escape.
    user_id_pattern = re.compile(r'.*id=(\d+)')
    event_count_pattern = re.compile(r'<strong id="event_count">(\d+?)</strong>')
    follow_count_pattern = re.compile(r'<strong id="follow_count">(\d+?)</strong>')
    fan_count_pattern = re.compile(r'<strong id="fan_count">(\d+?)</strong>')
    location_pattern = re.compile(r'<span>所在地区:(.+?)</span>')
    description_pattern = re.compile(r'<div class="inf s-fc3 f-brk">个人介绍:(.*?)</div>')
    age_pattern = re.compile(r'<span.*?data-age="(\d+)">')
    listening_songs_num_pattern = re.compile(r'<h4>累积听歌(\d+?)首</h4>')

    def first_group(pattern, text):
        # First capture group of the first match, or the unknown token
        match = pattern.search(text)
        return match.group(1) if match else Constants.UNKNOWN_TOKEN

    # Visit every user url
    for index, user_url in enumerate(users_url, 1):
        try:
            id_match = user_id_pattern.search(user_url)
            if id_match is None:
                raise ValueError("no user id found in url %r" % (user_url,))
            user_id = id_match.group(1)  # user id
            # Crawl time
            crawler_time = Helper.from_timestamp_to_date(time_stamp=time.time())
            # Fetch the html.
            # NOTE(review): no timeout is passed, so a stalled connection
            # blocks this loop; consider adding one.
            html = requests.get(user_url, headers=Constants.REQUEST_HEADERS).text
            # Number of events
            event_count = first_group(event_count_pattern, html)
            # Number of users this user follows
            follow_count = first_group(follow_count_pattern, html)
            # Number of fans
            fan_count = first_group(fan_count_pattern, html)
            # Location
            location = first_group(location_pattern, html)
            # Personal description
            description = first_group(description_pattern, html)
            # Age: ``data-age`` holds a timestamp (presumably the birth date in
            # milliseconds since 1970 - TODO confirm); whole 365-day years since
            # 1970 are subtracted from the current year's offset from 1970.
            age_match = age_pattern.search(html)
            if age_match:
                current_year = int(Helper.from_timestamp_to_date(time_stamp=time.time(), format="%Y"))
                age = (current_year - 1970) - int(age_match.group(1)) // (1000 * 365 * 24 * 3600)
            else:
                age = Constants.UNKNOWN_TOKEN
            # Total number of songs listened to
            listening_songs_num = first_group(listening_songs_num_pattern, html)
            # Serialise the user info as a JSON string and collect it
            user_info_dict = {
                Constants.USER_ID_KEY: user_id,
                Constants.CRAWLER_TIME_KEY: crawler_time,
                Constants.EVENT_COUNT_KEY: event_count,
                Constants.FOLLOW_COUNT_KEY: follow_count,
                Constants.FAN_COUNT_KEY: fan_count,
                Constants.LOCATION_KEY: location,
                Constants.USER_DESCRIPTION_KEY: description,
                Constants.USER_AGE_KEY: age,
                Constants.LISTENING_SONGS_NUM_KEY: listening_songs_num
            }
            user_info_json_str = json.dumps(user_info_dict, ensure_ascii=False)
            users_info_list.append(user_info_json_str)
            if total_urls_num:
                # Multi-threaded call: ``with`` releases the lock even if the
                # logging call raises.
                with self.lock:
                    self.no_counter += 1
                    self.logger.info(
                        "Write {current}/{total} user info to file successfully!".format(
                            current=self.no_counter, total=total_urls_num))
            else:
                # Plain single-threaded call
                self.logger.info(
                    "Write {current}/{total} user info to file successfully!".format(current=index, total=num))
        except Exception as e:
            # Best effort: one failing url must not stop the remaining ones
            self.logger.error("Fail to get No.{index} comment user's info:{error}"
                              .format(index=index, error=e))
    return users_info_list
def core_visual_analyse(self):
    '''
    Core visualisation routine: render comment and user statistics of the
    current song as pyecharts HTML files inside the song's plot directory.

    Charts are produced in this order:
    1. comment time distribution, per month and per day (bar)
    2. like-count distribution (bar)
    3. comment keywords after stop-word removal (bar)
    4. user city distribution (geo map)
    5. user location distribution (bar)
    6. user event-count distribution (bar)
    7. user follow-count distribution (bar)
    8. user fan-count distribution (bar)
    9. user description keywords after stop-word removal (bar)
    10. user age distribution (bar)
    11. total listened songs, bucketed (bar)
    '''
    plots_dir = os.path.join(self.song_path, Constants.PLOTS_SAVE_NAME)
    Helper.mkdir(plots_dir)
    # Load all comments and all user records
    comments = Helper.load_file_format_json(self.comments_file_path)
    users = Helper.load_file_format_json(self.users_info_file_path)

    def titled(template):
        # Fill the song name into a chart title
        return template.format(song_name=self.song_name)

    def plot_path(file_name):
        # Location of one chart file inside the plot directory
        return os.path.join(plots_dir, file_name)

    def known_ints(records, key):
        # Integer values of ``key`` for all records where it is not the unknown token
        return [int(record[key]) for record in records if record[key] != Constants.UNKNOWN_TOKEN]

    # 1. Comment time distribution, by month and by day
    stamps = [comment[Constants.CREATE_TIME_STAMP_KEY] for comment in comments]
    by_month = []  # "year-month" strings
    by_day = []  # "year-month-day" strings
    for stamp in stamps:
        # The stored value is scaled by 0.001 to get the actual timestamp
        month_text = Helper.from_timestamp_to_date(stamp * 0.001, format="%Y-%m")
        day_text = Helper.from_timestamp_to_date(stamp * 0.001, format="%Y-%m-%d")
        by_month.append(month_text)
        by_day.append(day_text)
    self.save_sorted_bar_plot(
        datas=by_month,
        label="年-月",
        title=titled("歌曲<{song_name}>评论时间(年-月)数量分布"),
        key_index=0,
        save_path=plot_path(Constants.ECHARTS_COMMENTS_YEAR_MONTH_BAR_HTML)
    )
    self.save_sorted_bar_plot(
        datas=by_day,
        label="年-月-日",
        title=titled("歌曲<{song_name}>评论时间(年-月-日)数量分布"),
        key_index=0,
        save_path=plot_path(Constants.ECHARTS_COMMENTS_YEAR_MONTH_DAY_BAR_HTML)
    )

    # 2. Like-count distribution
    liked_counts = known_ints(comments, Constants.LIKED_COUNT_KEY)
    self.save_sorted_bar_plot(
        datas=liked_counts,
        label="点赞数量",
        title=titled("歌曲<{song_name}>评论点赞数量分布"),
        key_index=0,
        save_path=plot_path(Constants.ECHARTS_LIKED_COUNT_BAR_HTML)
    )

    # 3. Comment keywords after stop-word removal
    all_comment_text = "".join([comment[Constants.COMMENT_CONTENT_KEY] for comment in comments])
    raw_comment_words = Helper.cut_text(all_comment_text)
    stopwords = Helper.load_stopwords()
    # Drop stop words and words shorter than two characters
    comment_words = [word for word in raw_comment_words
                     if word not in stopwords and len(word) > 1]
    self.save_sorted_bar_plot(
        datas=comment_words,
        label="关键词",
        title=titled("歌曲<{song_name}>评论关键词数量分布(已去除停用词)"),
        key_index=1,
        save_path=plot_path(Constants.ECHARTS_COMMENTS_KEYWORDS_BAR_HTML),
        reverse=True
    )

    # 4. User city distribution on a map
    user_locations = [user[Constants.LOCATION_KEY] for user in users]
    supported_cities = Helper.load_echarts_support_cities()

    def supported_city_of(location):
        # First supported city whose name occurs in the location text, else None
        for candidate in supported_cities:
            if candidate in location:
                return candidate
        return None

    user_cities = []
    for location in user_locations:
        city = supported_city_of(location)
        if city is not None:
            user_cities.append(city)
    city_counts = list(Counter(user_cities).items())
    city_geo = Geo(titled("歌曲<{song_name}>评论用户所在地区分布"), title_color="#fff",
                   title_pos="left", width=1200,
                   height=600, background_color='#404a59')
    attr, value = city_geo.cast(city_counts)
    city_geo.add("", attr, value, visual_range=[0, 200], visual_text_color="#fff",
                 symbol_size=15, is_visualmap=True)
    city_geo_path = plot_path(Constants.ECHARTS_USERS_CITY_GEO_HTML)
    Helper.check_file_exits_and_overwrite(city_geo_path)
    city_geo.render(city_geo_path)

    # 5. User location distribution as a bar chart
    self.save_sorted_bar_plot(
        datas=user_locations,
        label="用户所在地区",
        title=titled("歌曲<{song_name}>评论用户所在地区分布"),
        key_index=1,
        save_path=plot_path(Constants.ECHARTS_USERS_LOCATION_BAR_HTML),
        reverse=True
    )

    # 6. User event-count distribution
    event_counts = known_ints(users, Constants.EVENT_COUNT_KEY)
    self.save_sorted_bar_plot(
        datas=event_counts,
        label="用户动态总数",
        title=titled("歌曲<{song_name}>评论用户动态总数分布"),
        key_index=0,
        save_path=plot_path(Constants.ECHARTS_EVENTS_COUNT_BAR_HTML)
    )

    # 7. User follow-count distribution
    follow_counts = known_ints(users, Constants.FOLLOW_COUNT_KEY)
    self.save_sorted_bar_plot(
        datas=follow_counts,
        label="用户关注人数",
        title=titled("歌曲<{song_name}>评论用户关注人数分布"),
        key_index=0,
        save_path=plot_path(Constants.ECHARTS_FOLLOW_COUNT_BAR_HTML)
    )

    # 8. User fan-count distribution
    fan_counts = known_ints(users, Constants.FAN_COUNT_KEY)
    self.save_sorted_bar_plot(
        datas=fan_counts,
        label="用户粉丝人数",
        title=titled("歌曲<{song_name}>评论用户粉丝人数分布"),
        key_index=0,
        save_path=plot_path(Constants.ECHARTS_FAN_COUNT_BAR_HTML)
    )

    # 9. User description keywords after stop-word removal
    all_description_text = "".join([user[Constants.USER_DESCRIPTION_KEY] for user in users])
    raw_description_words = Helper.cut_text(all_description_text)
    description_words = [word for word in raw_description_words
                         if word not in stopwords and len(word) > 1]
    self.save_sorted_bar_plot(
        datas=description_words,
        label="用户简介关键词",
        title=titled("歌曲<{song_name}>评论用户简介关键词数量分布(已去除停用词)"),
        key_index=1,
        save_path=plot_path(Constants.ECHARTS_USER_DESCRIPTION_KEYWORDS_BAR_HTML),
        reverse=True
    )

    # 10. User age distribution
    parsed_ages = known_ints(users, Constants.USER_AGE_KEY)
    # Only ages greater than or equal to zero are kept
    valid_ages = [age for age in parsed_ages if age >= 0]
    self.save_sorted_bar_plot(
        datas=valid_ages,
        label="年龄",
        title=titled("歌曲<{song_name}>评论用户年龄分布"),
        key_index=0,
        save_path=plot_path(Constants.ECHARTS_USER_AGE_BAR_HTML)
    )

    # 11. Total listened songs, bucketed because the value range is very wide
    listened_totals = known_ints(users, Constants.LISTENING_SONGS_NUM_KEY)
    listened_buckets = {'0-100': 0, '100-1000': 0, '1000-10000': 0, '>10000': 0}

    def bucket_of(total):
        # Name of the bucket a listened-song total falls into
        if total < 100:
            return '0-100'
        if total < 1000:
            return '100-1000'
        if total < 10000:
            return '1000-10000'
        return '>10000'

    for total in listened_totals:
        listened_buckets[bucket_of(total)] += 1
    self.save_sorted_bar_plot(
        datas=listened_buckets,
        label="听歌总数",
        title=titled("歌曲<{song_name}>评论用户听歌总数分布"),
        key_index=1,
        save_path=plot_path(Constants.ECHARTS_LISTENING_SONGS_NUM_BAR_HTML),
        reverse=True
    )
def pretty_print_user_play_list(self, uid, offset=0, limit=1000):
    '''
    Log the play lists of a user, one formatted message per play list.

    :param uid: user id
    :param offset: start position forwarded to ``get_user_play_list``
    :param limit: maximum number of play lists forwarded to ``get_user_play_list``
    :return: None
    '''
    response = self.netcloud_login.get_user_play_list(uid, offset, limit).json()
    playlists = response['playlist']
    # Number of play lists contained in this response
    self.logger.info("UserId {UserId} has total {total} play list!".format(UserId=uid, total=len(playlists)))

    template = ("play list name:{playlist_name}\ntags:{tags_str}\n"
                "high quality:{highQuality}\n"
                "description:{description}\nplay list cover image url:{coverImgUrl}\n"
                "create time:{createTime}\nupdate time:{updateTime}\n"
                "playlist id:{playlist_id}\n"
                "play count:{playCount}\n"
                "music count:{music_count}\n"
                "anonimous:{anonimous}\n"
                "creator user id:{creator_user_id}\ncreator user nickname:{creator_user_nickname}\n"
                "creator user gender:{creator_user_gender}\ncreator user signature:{creator_user_signature}\n"
                "creator user descrition:{creator_user_descrition}\n"
                "creator user detailDescription:{creator_user_detailDescription}\n"
                "creator user province no:{creator_user_province_no}\n"
                "creator user city no:{creator_user_city_no}\n"
                "creator user avatarUrl:{creator_user_avatarUrl}\n"
                "background url:{backgroundUrl}\n"
                "creator user birthday:{creator_user_birthday}\n"
                "artists:{artists}\n")

    # Print the play lists one by one
    for number, entry in enumerate(playlists, 1):
        self.logger.info("-" * 20 + " play list {index} ".format(index=number) + "-" * 20)
        fields = {}
        # Creation and update dates; raw values are scaled by 0.001 first
        fields['createTime'] = Helper.from_timestamp_to_date(entry['createTime'] * 0.001, format="%Y-%m-%d")
        fields['updateTime'] = Helper.from_timestamp_to_date(entry['updateTime'] * 0.001, format="%Y-%m-%d")
        # Tags, description and cover image url of the play list
        fields['tags_str'] = ",".join(entry['tags'])
        fields['description'] = entry['description']
        fields['coverImgUrl'] = entry['coverImgUrl']
        # Creator: id, nick name and gender
        fields['creator_user_id'] = entry['userId']
        creator = entry['creator']
        fields['creator_user_nickname'] = creator['nickname']
        gender_code = creator['gender']
        fields['creator_user_gender'] = (
            "male" if gender_code == 1 else ("female" if gender_code == 0 else "unknown"))
        # Creator: signature, description and detailed description
        fields['creator_user_signature'] = creator['signature']
        fields['creator_user_descrition'] = creator['description']
        fields['creator_user_detailDescription'] = creator['detailDescription']
        # Creator: city code, avatar url, province code and background url
        fields['creator_user_city_no'] = creator['city']
        fields['creator_user_avatarUrl'] = creator['avatarUrl']
        fields['creator_user_province_no'] = creator['province']
        fields['backgroundUrl'] = creator['backgroundUrl']
        # Creator birthday; a negative raw value is reported as "unknown"
        birthday_no = creator['birthday']
        if birthday_no < 0:
            fields['creator_user_birthday'] = "unknown"
        else:
            fields['creator_user_birthday'] = Helper.from_timestamp_to_date(birthday_no * 0.001,
                                                                            format="%Y-%m-%d")
        # Artists, name, high-quality flag, id, play count, anonymous flag, track count
        fields['artists'] = entry['artists']
        fields['playlist_name'] = entry['name']
        fields['highQuality'] = entry['highQuality']
        fields['playlist_id'] = entry['id']
        fields['playCount'] = entry['playCount']
        fields['anonimous'] = entry['anonimous']
        fields['music_count'] = entry['trackCount']
        # Emit everything as one formatted message
        self.logger.info(template.format(**fields))