def audio_crawler(path='songs'):
    """Crawl Bilibili audio albums and download every song as an .m4a file.

    Brute-forces album (menu) sids 12032-19999, creates one sub-directory
    per album under *path*, and skips songs whose file already exists.

    :param path: download directory name, created next to this source file
    :return: None
    """
    # Translation table stripping characters that are illegal in file names
    # (replaces the original chain of eight .replace() calls).
    illegal_chars = str.maketrans('', '', '/<>|:*?\\')
    # Base download directory lives next to this source file.
    base_dir = dirname(__file__) + "/" + path + "/"
    if not exists(base_dir):
        mkdir(base_dir)
    # BUGFIX: renamed loop variable from `sid`; the original shadowed it with
    # the per-song id below, which made the code misleading.
    for menu_sid in range(12032, 20000):
        url = "https://www.bilibili.com/audio/music-service-c/web/song/of-menu?sid={}&pn=1&ps=100".format(menu_sid)
        res = url_get(url=url, mode="json")
        data = dict_get(res, "data")
        # Deleted/private albums return no data -- skip them.
        if data is None:
            continue
        items = dict_get(data, "data")
        # BUGFIX: guard against a missing/empty song list instead of
        # iterating None below.
        if not items:
            continue
        # Fetch the album info for the directory name.
        info_url = "https://www.bilibili.com/audio/music-service-c/web/menu/info?sid={}".format(menu_sid)
        info_get = url_get(url=info_url, mode="json")
        raw_title = dict_get(info_get, "title")
        # BUGFIX: the original crashed with AttributeError when the album
        # info request returned no title.
        if raw_title is None:
            continue
        album_title = raw_title.translate(illegal_chars)
        if not exists(base_dir + album_title):
            mkdir(base_dir + album_title)
        # Walk every song of the album.
        for item in items:
            author = dict_get(item, "author")  # singer
            title = dict_get(item, "title")    # song title
            song_sid = dict_get(item, "id")    # song id for the download url
            songs_url = "https://www.bilibili.com/audio/music-service-c/web/url?sid={}".format(song_sid)
            songs_get = url_get(url=songs_url, mode="json")
            file_size = round(dict_get(songs_get, "size") / 1024 / 1024, 2)  # size in MB
            # Pick the "larger" of the two CDN entries as the real address.
            # NOTE(review): this is a lexicographic string comparison of two
            # urls, kept from the original -- confirm it is intended.
            cdns = dict_get(songs_get, "cdns")
            real_url = cdns[0] if cdns[0] > cdns[1] else cdns[1]
            print("Downloading Audio")
            song_file_name = base_dir + album_title + "/" + title + " - " + author + '.m4a'
            # Skip songs that were already downloaded.
            if exists(song_file_name):
                continue
            song_file_get = url_get(url=real_url, mode="content")
            # The with-statement closes the file; the original's explicit
            # close() inside the block was redundant and has been removed.
            with open(song_file_name, "wb") as song:
                song.write(song_file_get)
            # Progress report.
            print("album_title: {}".format(album_title))
            print("author: {}".format(author))
            print("title: {}".format(title))
            print("file_size: {} MB".format(file_size))
            print("-" * 60)
def game_crawler():
    """Crawl Bilibili's public game list and store each game into MySQL.

    Fetches gamelist.json and inserts name/summary/website rows into the
    `game_list` table, skipping games that are already present.
    """
    database = Database("localhost", "root", "", "bilibili")
    table_name = "game_list"
    game_list_url = "https://game.bilibili.com/gamelist.json"
    game_list_json = url_get(game_list_url, "json")
    for game in game_list_json:
        game_info = {
            'name': dict_get(game, "title"),
            'summary': dict_get(game, "summary"),
            'website': dict_get(game, "website"),
        }
        # Skip games that already exist in the table.
        if database.execute_sql(table_name=table_name, key="name",
                                value=game_info['name']) != 0:
            print("{} 重复,跳过".format(game_info['name']))
            print("-" * 60)
            # BUGFIX: the original fell through after printing the duplicate
            # notice and inserted the duplicate row anyway.
            continue
        if database.execute_sql(table_name=table_name, mode="insert",
                                keys=list(game_info.keys()),
                                values=list(game_info.values())):
            print("游戏名: {}".format(game_info['name']))
            print("游戏介绍: {}".format(game_info['summary']))
            print("游戏官网: {}".format(game_info['website']))
            print("-" * 60)
def crawler(av):
    """Walk av numbers from *av* up to 48544470 and archive video stats.

    Each video's metadata is fetched from the web-interface API and
    inserted into the `video` table; missing videos and duplicates are
    reported and skipped.

    :param av: first av number to crawl
    """
    database = Database(host="localhost", username="******", password="",
                        db_name="bilibili")
    for av_num in range(av, 48544470):
        url = "https://api.bilibili.com/x/web-interface/view?aid={}".format(
            av_num)
        resp = url_get(url=url, mode="json")
        # Non-zero code means the video does not exist.
        if dict_get(resp, "code") != 0:
            print('错误!没有此视频!av:{}'.format(av_num))
            print('-' * 60)
            continue
        # Column name -> value; insertion order defines the insert columns.
        # no_reprint == 0 marks a repost, anything else an original upload.
        data = {
            'video_av': str(av_num),
            'video_up': dict_get(resp, "name"),
            'video_title': dict_get(resp, "title"),
            'video_classification': dict_get(resp, "tname"),
            'video_view': dict_get(resp, "view"),
            'video_share': dict_get(resp, "share"),
            'video_like': dict_get(resp, "like"),
            'video_favorite': dict_get(resp, "favorite"),
            'video_coin': dict_get(resp, "coin"),
            'video_update': strftime("%Y-%m-%d %H:%M:%S",
                                     gmtime(dict_get(resp, "ctime"))),
            'video_reply': dict_get(resp, "reply"),
            'video_danmaku': dict_get(resp, "danmaku"),
            'video_reprint': "转载" if dict_get(resp, "no_reprint") == 0 else "原创",
        }
        # Duplicate rows are reported and skipped.
        if database.execute_sql(table_name="video", mode="search",
                                key="video_av",
                                value=data['video_av']) != 0:
            print('错误!此视频已存在!av:{}'.format(av_num))
            print('-' * 60)
            continue
        if database.execute_sql(table_name="video", mode="insert",
                                keys=list(data.keys()),
                                values=list(data.values())):
            print("视频av号: {}".format(data['video_av']))
            print("作者: {}".format(data['video_up']))
            print("标题: {}".format(data['video_title']))
            print("视频分类: {}".format(data['video_classification']))
            print("观看数: {}".format(data['video_view']))
            print("分享数: {}".format(data['video_share']))
            print("点赞数: {}".format(data['video_like']))
            print("收藏数: {}".format(data['video_favorite']))
            print("投币数: {}".format(data['video_coin']))
            print("上传时间: {}".format(data['video_update']))
            print("评论数: {}".format(data['video_reply']))
            print("弹幕数: {}".format(data['video_danmaku']))
            print("性质: {}".format(data['video_reprint']))
            print("-" * 60)
def crawler(type_, sort, path='save_picture', page_num=0):
    """Download Bilibili photo-album pictures page by page.

    :param type_: category --> cos or sifu
    :param sort: ordering --> hot or new
    :param path: save directory (created next to this source file)
    :param page_num: first page to crawl, defaults to page 0
    """
    # Translation table stripping characters illegal in directory names
    # (replaces the original chain of eight .replace() calls).
    illegal_chars = str.maketrans('', '', '/<>|:*?\\')
    base_dir = dirname(__file__) + "/" + path + "/"
    # BUGFIX: the original created `path` relative to the current working
    # directory but wrote files under dirname(__file__)/path; when the CWD
    # differed, open() failed.  Both operations now use base_dir.
    if path != '' and not exists(base_dir):
        mkdir(base_dir)
    # BUGFIX: iterate over pages instead of tail-recursing once per page;
    # the original could hit RecursionError on long listings.
    while True:
        url = "https://api.vc.bilibili.com/link_draw/v2/Photo/list?category={}&type={}&page_num={}&page_size=20".format(
            type_, sort, page_num)
        res = url_get(url=url, mode="json")
        items = dict_get(res, "items")
        # An empty page ends the crawl.
        if len(items) == 0:
            print("Current page have no any picture, Exit mission!")
            return
        for i in items:
            title = dict_get(i, "title")  # album title
            up = dict_get(i, "name")      # uploader
            directory_name = title.translate(illegal_chars) + "-" + up
            album_dir = base_dir + directory_name
            if not exists(album_dir):
                mkdir(album_dir)
            # Collect every picture url of the album.
            picture_list = [picture['img_src']
                            for picture in dict_get(i, "pictures")]
            print("Downloading Pictures")
            for pic in picture_list:
                pic_name = pic.split("/")[-1]
                full_pic_path = album_dir + "/" + pic_name
                # Skip pictures that were already downloaded.
                if exists(full_pic_path):
                    continue
                pic_get = url_get(url=pic, mode="content")
                with open(full_pic_path, "wb") as pic_file:
                    pic_file.write(pic_get)
            # Progress report.
            print("current page: {}".format(page_num + 1))
            print("title: {}".format(title))
            print("up: {}".format(up))
            print("picture: {}".format(len(picture_list)))
            print("-" * 60)
        page_num += 1
def column_crawler():
    """Crawl Bilibili column (article) recommendations into MySQL.

    Walks every category in `cid_dict` page by page until an empty page
    is returned, inserting one row per article into the `zhuanlan` table
    and skipping articles that are already stored.
    """
    database = Database("localhost", "root", "", "bilibili")
    table_name = "zhuanlan"
    # Category name -> Bilibili category id (cid).
    cid_dict = {
        "动画": 2,
        "游戏": 1,
        "影视": 28,
        "生活": 3,
        "兴趣": 29,
        "轻小说": 16,
        "科技": 17,
    }
    for cid in cid_dict.values():
        pn = 1
        while True:
            column_url = "https://api.bilibili.com/x/article/recommends?cid={}&pn={}&ps=100&sort=0".format(
                cid, pn)
            column_get = url_get(column_url, mode="json")
            column_data = dict_get(column_get, "data")
            # An empty page means the category is exhausted.
            # BUGFIX: `not column_data` also covers a missing "data" key,
            # where the original's len(None) raised TypeError; a leftover
            # debug print of the page number was dropped as well.
            if not column_data:
                break
            for item in column_data:
                data = {}
                author_info = dict_get(item, "author")           # author block
                data['author_mid'] = author_info['mid']          # author id
                data['author_name'] = author_info['name']        # author username
                data['category'] = dict_get(item, "category")['name']  # category name
                data['update_time'] = strftime(
                    "%Y-%m-%d %H:%M:%S",
                    localtime(dict_get(item, 'update_time')))    # upload time
                # Article id; the full text can be fetched from
                # https://www.bilibili.com/read/cv<art_id>
                data['art_id'] = dict_get(item, "id")
                data['art_title'] = dict_get(item, "title")      # title
                data['art_words'] = dict_get(item, "words")      # word count
                data['art_like'] = dict_get(item, "like")        # likes
                data['art_reply'] = dict_get(item, "reply")      # replies
                data['art_view'] = dict_get(item, "view")        # views
                data['art_favorite'] = dict_get(item, "favorite")  # favourites
                data['art_coin'] = dict_get(item, "coin")        # coins
                data['art_share'] = dict_get(item, "share")      # shares
                data['art_summary'] = dict_get(item, "summary")  # summary
                data['crawl_time'] = strftime("%Y-%m-%d %H:%M:%S",
                                              localtime())       # crawl time
                # Skip articles that already exist in the table.
                if database.execute_sql(table_name=table_name, select="id",
                                        key="art_id",
                                        value=data['art_id']) != 0:
                    print("id:{} 重复,跳过".format(data['art_id']))
                    print("-" * 60)
                    continue
                if database.execute_sql(table_name=table_name, mode="insert",
                                        keys=list(data.keys()),
                                        values=list(data.values())):
                    print("作者id: {}".format(data['author_mid']))
                    print("作者用户名: {}".format(data['author_name']))
                    print("所属分类: {}".format(data['category']))
                    print("上传时间: {}".format(data['update_time']))
                    print("文章id: {}".format(data['art_id']))
                    print("文章标题: {}".format(data['art_title']))
                    print("文章字数: {}".format(data['art_words']))
                    print("文章点赞数: {}".format(data['art_like']))
                    print("文章评论数: {}".format(data['art_reply']))
                    print("文章浏览数: {}".format(data['art_view']))
                    print("文章收藏数: {}".format(data['art_favorite']))
                    print("文章投币数: {}".format(data['art_coin']))
                    print("文章分享数: {}".format(data['art_share']))
                    print("文章摘要: {}".format(data['art_summary']))
                    print("爬取时间: {}".format(data['crawl_time']))
                    print("-" * 60)
                else:
                    print("id:{} 异常,跳过".format(data['art_id']))
                    print("-" * 60)
                # Cleanup: the original ended both branches with a redundant
                # `continue` and carried commented-out `pn += 1` dead code.
            pn += 1
def member_crawler(mid):
    """Crawl Bilibili member profiles into the `member` table, forever.

    Starting at *mid*, each round fetches the profile, relation stats,
    play counts, tags, charging rank and upload counts for one member,
    inserts a row, and advances to the next id.  Missing members,
    duplicates and any per-member error are reported and skipped.

    :param mid: first member id to crawl
    """
    database = Database("localhost", "root", "", "bilibili")
    while True:
        # One endpoint per data source for the current mid.
        follow_url = "https://api.bilibili.com/x/relation/stat?vmid={}".format(mid)
        view_url = "https://api.bilibili.com/x/space/upstat?mid={}".format(mid)
        info_url = "https://api.bilibili.com/x/space/acc/info?mid={}".format(mid)
        tag_url = "https://space.bilibili.com/ajax/member/getTags?mids={}".format(mid)
        charging_url = "https://elec.bilibili.com/api/query.rank.do?mid={}".format(mid)
        upload_data_url = "https://api.bilibili.com/x/space/navnum?mid={}".format(mid)
        try:
            profile = url_get(info_url, mode='json')
            username = dict_get(profile, "name")
            # A missing name means the account does not exist.
            if username is None:
                print("该会员不存在, 跳过 {}".format(mid))
                print("-" * 60)
                mid += 1
                continue
            level = dict_get(profile, "level")
            member_id = dict_get(profile, "mid")
            sex = dict_get(profile, "sex")
            coins = dict_get(profile, "coins")
            official_info = dict_get(profile, "official")
            relation = url_get(follow_url, mode="json")
            following = dict_get(relation, 'following')
            follower = dict_get(relation, 'follower')
            view = dict_get(url_get(view_url, mode="json"), "view")
            # role == 1 marks an officially verified account.
            official = official_info['title'] if official_info['role'] == 1 else "暂无认证"
            birthday = dict_get(profile, "birthday")
            sign = dict_get(profile, "sign")
            vip_status = "是" if dict_get(profile, "status") == 1 else "否"
            # Space-separated tag string (keeps the original trailing space).
            tag = ''.join(t + ' ' for t in
                          dict_get(url_get(tag_url, mode="json"), "tags"))
            charging = dict_get(url_get(charging_url, mode="json"),
                                "total_count")
            video_upload = dict_get(url_get(upload_data_url, mode="json"),
                                    "video")
            # Skip members that already exist in the table.
            if database.execute_sql(table_name="member", mode="search",
                                    key="member_id", value=member_id) != 0:
                print("该会员已存在, 跳过 {}".format(member_id))
                print("-" * 60)
                mid += 1
                continue
            # Column name -> value; insertion order defines insert columns.
            insert_data = {
                "member_id": member_id,
                "username": username,
                "sex": sex,
                "birthday": birthday,
                "level": level,
                "coins": coins,
                "sign": sign,
                "charging": charging,
                "video_upload": video_upload,
                "tag": tag,
                "vip_status": vip_status,
                "official": official,
                "following": following,
                "follower": follower,
                "view": view,
            }
            if database.execute_sql(mode="insert", table_name="member",
                                    keys=list(insert_data.keys()),
                                    values=list(insert_data.values())):
                print("用户id: {}".format(member_id))
                print("用户名: {}".format(username))
                print("性别: {}".format(sex))
                print("生日: {}".format(birthday))
                print("等级: {}".format(level))
                print("B币: {}".format(coins))
                print("个人签名: {}".format(sign))
                print("充电人数: {}".format(charging))
                print("视频数量: {}".format(video_upload))
                print("标签: {}".format(tag))
                print("B站大会员: {}".format(vip_status))
                print("Bilibili认证: {}".format(official))
                print("关注数: {}".format(following))
                print("粉丝数: {}".format(follower))
                print("播放量: {}".format(view))
                print("-" * 60)
            mid += 1
        except Exception as err:
            # Best-effort crawler: log the failure and move to the next id.
            print("错误, 跳过 mid={}".format(mid))
            print(err)
            print("-" * 60)
            mid += 1
            continue
def rank_crawler():
    """Crawl Bilibili ranking boards into per-board text files.

    For every category (rid) and ranking window (day) the ranked videos
    are appended to rank/Bilibili-<category>-<window>.txt and echoed to
    stdout.  (Swap the file writes for functions.database to store the
    data in MySQL instead.)
    """
    save_path = "rank"
    if not exists(save_path):
        mkdir(save_path)
    # Category name -> rid query parameter.
    rid_dict = {
        "全站": 0,
        "动画": 1,
        "国创相关": 168,
        "音乐": 3,
        "舞蹈": 129,
        "游戏": 4,
        "科技": 36,
        "数码": 188,
        "生活": 160,
        "鬼畜": 119,
        "时尚": 155,
        "娱乐": 5,
        "影视": 181,
    }
    # Ranking window name -> day query parameter.
    day_dict = {
        "日排行": 1,
        "三日排行": 3,
        "周排行": 7,
        "月排行": 30,
    }
    for rid_name, rid in rid_dict.items():
        for day_name, day in day_dict.items():
            url = "https://api.bilibili.com/x/web-interface/ranking?rid={}&day={}".format(rid, day)
            res = url_get(url=url, mode="json")
            rank_list = dict_get(res, "list")
            # NOTE: "a+" appends, so re-running the crawler duplicates rows
            # in an existing file.
            # PERF/BUGFIX: open each board's file once instead of reopening
            # it in append mode for every ranked video; the with-statement
            # also makes the original's explicit close() redundant.
            with open("{}/Bilibili-{}-{}.txt".format(save_path, rid_name, day_name),
                      "a+", encoding="utf-8") as data_file:
                for rank, video in enumerate(rank_list, start=1):
                    aid = dict_get(video, "aid")                    # video id
                    author = dict_get(video, "author")              # uploader
                    coins = dict_get(video, "coins")                # coins
                    play = dict_get(video, "play")                  # plays
                    pts = dict_get(video, "pts")                    # overall score
                    title = dict_get(video, "title")                # video title
                    video_review = dict_get(video, "video_review")  # danmaku count(?)
                    # no_reprint == 1 marks an original upload.
                    reprint = "原创" if dict_get(video, "no_reprint") == 1 else "转载"
                    data_file.write("排名: {}\n".format(rank))
                    data_file.write("视频id: {}\n".format(aid))
                    data_file.write("up主: {}\n".format(author))
                    data_file.write("投币数: {}\n".format(coins))
                    data_file.write("播放数: {}\n".format(play))
                    data_file.write("综合得分: {}\n".format(pts))
                    data_file.write("视频标题: {}\n".format(title))
                    data_file.write("视频弹幕数: {}\n".format(video_review))
                    data_file.write("是否原创: {}\n".format(reprint))
                    data_file.write("-" * 60 + "\n")
                    # Progress report.
                    print("排名: {}".format(rank))
                    print("视频id: {}".format(aid))
                    print("up主: {}".format(author))
                    print("投币数: {}".format(coins))
                    print("播放数: {}".format(play))
                    print("综合得分: {}".format(pts))
                    print("视频标题: {}".format(title))
                    print("视频弹幕数: {}".format(video_review))
                    print("是否原创: {}".format(reprint))
                    print("-" * 60)
def micro_video_crawler(order='', page_num=1):
    """Crawl Bilibili micro (clip) videos for every tag into MySQL.

    First collects every tag from the zone list, then pages through each
    tag's search results, inserting one row per clip into the
    `micro_video` table and skipping clips that are already stored.

    :param order: sort order; "new" sorts by upload time, '' uses the
        system recommendation
    :param page_num: first page to fetch for every tag
    """
    database = Database("localhost", "root", "", "bilibili")
    table_name = "micro_video"
    # Gather every tag from every zone.
    zone_url = "https://api.vc.bilibili.com/clip/v1/video/zonelist?page=total"
    zone_json = url_get(zone_url, "json")
    zone_data = dict_get(zone_json, "data")
    tag_pool = []
    for zone_name in zone_data:
        zone_info = zone_data[zone_name]
        if zone_info == '':
            continue
        tag_pool.extend(zone_info['tags'])
    for tag in tag_pool:
        page_size = 50  # API maximum page_size
        pn = page_num   # starting page, caller-configurable
        while True:
            offset = (pn - 1) * page_size
            micro_video_url = "https://api.vc.bilibili.com/clip/v1/video/search?" \
                              "page_size={}&need_playurl=0&next_offset={}&order={}" \
                              "&tag={}".format(page_size, offset, order, tag)
            micro_video_json = url_get(micro_video_url, "json")
            items = dict_get(micro_video_json, "items")
            # An empty page ends this tag.
            if len(items) == 0:
                break
            for item in items:
                # Column name -> value; insertion order defines the
                # insert columns.
                video_info = {"tag": tag}
                video_info['title'] = dict_get(item, "description").replace("\n", "")  # clip title
                video_info['video_id'] = dict_get(item, "id")               # clip id
                video_info['reply'] = dict_get(item, "reply")               # replies
                video_info['upload_time'] = dict_get(item, "upload_time")   # upload time
                video_info['video_size'] = round(
                    float(dict_get(item, "video_size")) / 1024**2, 2)       # size in MB (float)
                video_info['video_time'] = dict_get(item, "video_time")     # duration in s
                video_info['video_playurl'] = dict_get(item, "video_playurl")  # play url
                video_info['watched_num'] = dict_get(item, "watched_num")   # plays
                video_info['name'] = dict_get(item, "name")                 # uploader name
                video_info['uid'] = dict_get(item, "uid")                   # uploader uid
                # To also download the clip itself, uncomment:
                # video_content = url_get(video_info['video_playurl'], "content")
                # video_file_name = video_info['title'][:30].replace("/", '').replace("<", '').replace(">", '').replace(
                #     "|", '').replace(":", '').replace("*", '').replace("?", '').replace("\\", '') + ".mp4"
                # with open(video_file_name, "wb") as video_file:
                #     video_file.write(video_content)
                # Comment out the part below if database storage is unwanted.
                if database.execute_sql(table_name=table_name, key="video_id",
                                        value=video_info['video_id']) != 0:
                    print("视频id:{} 重复,跳过".format(video_info['video_id']))
                    print("-" * 60)
                    continue
                if database.execute_sql(table_name=table_name, mode="insert",
                                        keys=list(video_info.keys()),
                                        values=list(video_info.values())):
                    print("视频标题: {}".format(video_info['title']))
                    print("视频id: {}".format(video_info['video_id']))
                    print("视频评论数: {}".format(video_info['reply']))
                    print("视频上传时间: {}".format(video_info['upload_time']))
                    print("视频大小(mb): {}".format(video_info['video_size']))
                    print("视频时长: {}".format(video_info['video_time']))
                    print("视频播放地址: {}".format(video_info['video_playurl']))
                    print("视频观看数: {}".format(video_info['watched_num']))
                    print("上传者用户名: {}".format(video_info['name']))
                    print("上传者id: {}".format(video_info['uid']))
                    print("-" * 60)
            pn += 1