def get_videos(url):
    """Build the list of videos for the season page at ``url``.

    The page is fetched and its ``season_id`` is extracted with a regular
    expression; that id is substituted into ``info_api`` and the episodes
    found under ``result.main_section.episodes`` of the JSON reply are turned
    into ``BililiVideo`` objects.  Every target path is also written to the
    configured playlist, when there is one.

    Args:
        url: Address of the page whose source contains the season id.

    Returns:
        A list of ``BililiVideo`` objects, numbered from 1 in reply order.
    """
    page_source = spider.get(url).text
    season_id = re.search(
        r'"param":{"season_id":(\d+),"season_type":\d+}', page_source
    ).group(1)
    reply = spider.get(info_api.format(season_id=season_id))
    episodes = reply.json()["result"]["main_section"]["episodes"]

    videos = []
    for number, episode in enumerate(episodes, start=1):
        label = episode["title"]
        # Numeric-looking titles ("12", "12.5") are rendered as "第N话".
        if re.match(r'^\d*\.?\d*$', label):
            label = '第{}话'.format(label)
        name = repair_filename(' '.join([label, episode["long_title"]]))
        file_path = os.path.join(
            CONFIG['video_dir'], repair_filename('{}.mp4'.format(name)))

        playlist = CONFIG['playlist']
        if playlist is not None:
            playlist.write_path(file_path)

        episode_meta = {
            "aid": episode["aid"],
            "cid": episode["cid"],
            "epid": episode["id"],
        }
        videos.append(BililiVideo(
            id=number,
            name=name,
            path=file_path,
            meta=episode_meta,
            segment_dl=CONFIG["segment_dl"],
            segment_size=CONFIG["segment_size"],
            overwrite=CONFIG["overwrite"],
            spider=spider,
        ))
    return videos
def get_videos(url):
    """Build the list of videos (parts) for the ``av`` page at ``url``.

    The av id is taken from the URL, stored in ``CONFIG["avid"]`` and
    substituted into ``info_api``; every entry of the reply's ``data`` list
    becomes one ``BililiVideo``.  Every target path is also written to the
    configured playlist, when there is one.

    Args:
        url: A ``www.bilibili.com/video/av<id>`` or ``b23.tv/av<id>`` URL.

    Returns:
        A list of ``BililiVideo`` objects, numbered from 1 in reply order.

    Raises:
        ValueError: If ``url`` matches none of the supported URL patterns.
    """
    url_patterns = (
        r"https?://www.bilibili.com/video/av(\d+)",
        r"https?://b23.tv/av(\d+)",
    )
    avid = None
    for pattern in url_patterns:
        matched = re.match(pattern, url)
        if matched:
            avid = matched.group(1)
            break
    # Previously an unsupported URL left ``avid`` unbound and the function
    # failed with an opaque UnboundLocalError; fail with a clear message.
    if avid is None:
        raise ValueError("cannot find an av id in url: {}".format(url))

    CONFIG["avid"] = avid
    res = spider.get(info_api.format(avid=avid))

    videos = []
    for number, item in enumerate(res.json()["data"], start=1):
        file_path = os.path.join(
            CONFIG['video_dir'],
            repair_filename('{}.mp4'.format(item["part"])))
        if CONFIG['playlist'] is not None:
            CONFIG['playlist'].write_path(file_path)
        videos.append(BililiVideo(
            id=number,
            name=item["part"],
            path=file_path,
            meta={"cid": item["cid"]},
            segment_dl=CONFIG["segment_dl"],
            segment_size=CONFIG["segment_size"],
            overwrite=CONFIG["overwrite"],
            spider=spider,
        ))
    return videos
def get_summary(url):
    """Read the term id and the course name from the course home page.

    ``learn/`` in ``url`` is replaced by ``course/`` before the page is
    fetched.  The term id comes from the first ``termId : "<digits>"``
    occurrence; the course name is every ``name:"..."`` match except the
    first, joined with " - " and passed through ``repair_filename``.

    Args:
        url: Address of the course page.

    Returns:
        A ``(term_id, course_name)`` tuple; ``term_id`` is a string of digits.
    """
    course_url = url.replace('learn/', 'course/')
    page = spider.get(course_url).text

    term_id = re.search(r'termId : "(\d+)"', page).group(1)

    # The first name match is skipped; the remaining ones form the title.
    name_parts = re.findall(r'name:"(.+)"', page)[1:]
    course_name = " - ".join(name_parts)

    return term_id, repair_filename(course_name)
def get_resource(term_id, token, file_types=(VIDEO, PDF, RICH_TEXT)):
    """Collect the downloadable courseware of a term.

    Walks chapters -> lessons -> units of the structure returned by
    ``get_courseinfo`` and, for every unit whose ``contentType`` is listed in
    ``file_types``, builds its target path from
    ``CONFIG["file_path_template"]`` and makes sure the parent directory
    exists.  Video paths are additionally written to the playlist.

    Args:
        term_id: Term identifier passed through to ``get_courseinfo``.
        token: Token passed through to ``get_courseinfo``.
        file_types: Container of content types to include.  The default is an
            immutable tuple so the default value cannot be shared and mutated
            between calls.

    Returns:
        A list of tuples: ``(VIDEO, path, unit_id, content_id)``,
        ``(PDF, path, unit_id, content_id)`` or
        ``(RICH_TEXT, path, json_content)``.  Rich-text units without
        ``jsonContent`` produce no entry.
    """
    resource_list = []
    path_template = CONFIG["file_path_template"]
    course_info = get_courseinfo(term_id, token)
    chapters = course_info.get("results").get("termDto").get("chapters")

    for chapter_num, chapter in enumerate(chapters):
        for lesson_num, lesson in enumerate(chapter.get("lessons")):
            for unit_num, unit in enumerate(lesson.get("units")):
                content_type = unit["contentType"]
                if content_type not in file_types:
                    continue

                courseware_num = (
                    chapter_num + 1, lesson_num + 1, unit_num + 1)
                # Template fields shared by the regular path and the
                # attachment path of rich-text units.
                shared_fields = dict(
                    base_dir=base_dir,
                    sep=os.path.sep,
                    cnt_1=get_section_num(courseware_num, level=1),
                    cnt_2=get_section_num(courseware_num, level=2),
                    cnt_3=get_section_num(courseware_num, level=3),
                    chapter_name=repair_filename(chapter["name"]),
                    lesson_name=repair_filename(lesson["name"]),
                )
                file_path = path_template.format(
                    type=COURSEWARE.get(content_type, "Unknown"),
                    unit_name=repair_filename(unit["name"]),
                    **shared_fields
                )
                touch_dir(os.path.dirname(file_path))

                if content_type == VIDEO:
                    file_path += ".mp4"
                    playlist.write_path(file_path)
                    resource_list.append(
                        (VIDEO, file_path, unit["id"], unit["contentId"]))
                elif content_type == PDF:
                    file_path += ".pdf"
                    resource_list.append(
                        (PDF, file_path, unit["id"], unit["contentId"]))
                elif content_type == RICH_TEXT and unit.get("jsonContent"):
                    # SECURITY NOTE(review): ``jsonContent`` comes from the
                    # remote response and is executed by ``eval``.  Consider
                    # ``json.loads`` / ``ast.literal_eval`` once the payload
                    # format is confirmed; kept as-is to preserve behaviour.
                    json_content = eval(unit["jsonContent"])
                    # Only the stem is sanitised so the attachment keeps its
                    # original extension.
                    stem, ext = os.path.splitext(json_content["fileName"])
                    file_path = path_template.format(
                        type="File",
                        unit_name=repair_filename(stem) + ext,
                        **shared_fields
                    )
                    touch_dir(os.path.dirname(file_path))
                    resource_list.append(
                        (RICH_TEXT, file_path, json_content))
    return resource_list
def parse_segment_info(video):
    """Resolve the segment URLs of ``video`` and save its side files.

    In order, the function:

    1. requests ``subtitle_api`` and, when the reply contains a
       ``<subtitle>`` element, writes one ``.srt`` file per listed subtitle;
    2. requests ``danmaku_api`` and saves the reply text next to the video
       with an ``.xml`` extension;
    3. probes ``parse_api`` with ``qn=80`` to learn whether the video can be
       downloaded and which qualities are accepted;
    4. requests ``parse_api`` again with the chosen quality and appends one
       ``BililiVideoSegment`` per ``durl`` entry to ``video.segments``.

    Args:
        video: Object providing ``meta["cid"]``, ``path``, ``name`` and a
            ``segments`` list.  The av id is read from ``CONFIG["avid"]``.

    Returns:
        None.  When the probe reports a non-zero ``code`` a warning is
        printed and no segments are added.
    """
    cid, avid = video.meta["cid"], CONFIG["avid"]
    path_stem = os.path.splitext(video.path)[0]

    # Subtitles: skip quietly when the reply carries no <subtitle> element
    # instead of crashing on ``None.group``.
    res = spider.get(subtitle_api.format(avid=avid, cid=cid))
    subtitle_match = re.search(r"<subtitle>(.+)</subtitle>", res.text)
    if subtitle_match is not None:
        subtitles_info = json.loads(subtitle_match.group(1))
        for sub_info in subtitles_info["subtitles"]:
            subtitle = Subtitle(path_stem + sub_info["lan_doc"] + ".srt")
            # The listed subtitle URL has no scheme, so "https:" is prepended.
            lines = spider.get("https:" + sub_info["subtitle_url"]).json()
            for sub_line in lines["body"]:
                subtitle.write_line(
                    sub_line["content"], sub_line["from"], sub_line["to"])

    # Danmaku: store the reply text as UTF-8 next to the video.
    res = spider.get(danmaku_api.format(cid=cid))
    res.encoding = "utf-8"
    with open(path_stem + ".xml", "w", encoding="utf-8") as f:
        f.write(res.text)

    # Probe: can the video be downloaded, and in which qualities?
    touch_message = spider.get(
        parse_api.format(avid=avid, cid=cid, qn=80)).json()
    if touch_message["code"] != 0:
        print("warn: 无法下载 {} ,原因: {}".format(
            video.name, touch_message["message"]))
        return

    # First configured quality that is accepted; when none is, the loop ends
    # with the last entry of ``CONFIG['qn_seq']`` still bound to ``qn``.
    accept_quality = touch_message['data']['accept_quality']
    for qn in CONFIG['qn_seq']:
        if qn in accept_quality:
            break

    res = spider.get(parse_api.format(avid=avid, cid=cid, qn=qn))
    for segment_id, segment in enumerate(res.json()['data']['durl'], start=1):
        file_path = os.path.join(
            CONFIG['video_dir'],
            repair_filename('{}_{:02d}.flv'.format(video.name, segment_id)))
        video.segments.append(BililiVideoSegment(
            id=segment_id,
            path=file_path,
            url=segment["url"],
            qn=qn,
            video=video,
        ))
def parse_segment_info(video):
    """Resolve the segment URLs of ``video`` and save its danmaku file.

    In order, the function:

    1. requests ``danmaku_api`` and saves the reply text next to the video
       with an ``.xml`` extension;
    2. probes ``parse_api`` with ``qn=80`` to learn whether the video can be
       downloaded and which qualities are accepted;
    3. requests ``parse_api`` again with the chosen quality and appends one
       ``BililiVideoSegment`` per ``durl`` entry to ``video.segments``.

    Args:
        video: Object providing ``meta["aid"]``, ``meta["cid"]``,
            ``meta["epid"]``, ``path``, ``name``, ``switch_status`` and a
            ``segments`` list.

    Returns:
        None.  When the probe reports a non-zero ``code`` a warning is
        printed, the video is switched to ``DONE`` and no segments are added.
    """
    aid, cid, ep_id = video.meta["aid"], video.meta["cid"], video.meta["epid"]

    # Danmaku: store the reply text as UTF-8 next to the video.
    res = spider.get(danmaku_api.format(cid=cid))
    res.encoding = "utf-8"
    danmaku_path = os.path.splitext(video.path)[0] + ".xml"
    with open(danmaku_path, "w", encoding="utf-8") as f:
        f.write(res.text)

    # Probe: can the video be downloaded, and in which qualities?
    touch_message = spider.get(
        parse_api.format(avid=aid, cid=cid, ep_id=ep_id, qn=80)).json()
    if touch_message["code"] != 0:
        print("warn: 无法下载 {} ,原因: {}".format(
            video.name, touch_message["message"]))
        video.switch_status(DONE)
        return
    if touch_message["result"]["is_preview"] == 1:
        print("warn: {} 为预览版视频".format(video.name))

    # First configured quality that is accepted; when none is, the loop ends
    # with the last entry of ``CONFIG['qn_seq']`` still bound to ``qn``.
    accept_quality = touch_message['result']['accept_quality']
    for qn in CONFIG['qn_seq']:
        if qn in accept_quality:
            break

    res = spider.get(
        parse_api.format(avid=aid, cid=cid, ep_id=ep_id, qn=qn))
    durl = res.json()['result']['durl']
    for segment_id, segment in enumerate(durl, start=1):
        file_path = os.path.join(
            CONFIG['video_dir'],
            repair_filename('{}_{:02d}.flv'.format(video.name, segment_id)))
        video.segments.append(BililiVideoSegment(
            id=segment_id,
            path=file_path,
            url=segment["url"],
            qn=qn,
            video=video,
        ))
def get_resource(term_id, token):
    """Collect the downloadable courseware of a term.

    Walks chapters -> lessons -> units of the structure returned by
    ``get_courseinfo``, builds the target path
    ``base_dir/<n chapter>/<n lesson>/<n unit>`` for each unit and makes sure
    its parent directory exists.  Video paths are also written to the
    playlist.

    Args:
        term_id: Term identifier passed through to ``get_courseinfo``.
        token: Token passed through to ``get_courseinfo``.

    Returns:
        A list of tuples: ``(VIDEO, path, unit_id, content_id)``,
        ``(PDF, path, unit_id, content_id)`` or
        ``(RICH_TEXT, path, json_content)``.  Units of any other type, and
        rich-text units without ``jsonContent``, produce no entry.
    """

    def build_path(section, chapter, lesson, leaf_name):
        # <base_dir>/<no. chapter>/<no. lesson>/<no. leaf_name>
        return os.path.join(
            base_dir,
            get_section_num(section, level=1) + " "
            + repair_filename(chapter["name"]),
            get_section_num(section, level=2) + " "
            + repair_filename(lesson["name"]),
            get_section_num(section, level=3) + " "
            + repair_filename(leaf_name),
        )

    resource_list = []
    course_info = get_courseinfo(term_id, token)
    chapters = course_info.get('results').get('termDto').get('chapters')

    for chapter_idx, chapter in enumerate(chapters):
        for lesson_idx, lesson in enumerate(chapter.get('lessons')):
            for unit_idx, unit in enumerate(lesson.get('units')):
                section = (chapter_idx + 1, lesson_idx + 1, unit_idx + 1)
                file_path = build_path(section, chapter, lesson, unit["name"])
                touch_dir(os.path.dirname(file_path))

                if unit['contentType'] == VIDEO:
                    file_path = file_path + '.mp4'
                    playlist.write_path(file_path)
                    entry = (VIDEO, file_path, unit['id'], unit['contentId'])
                    resource_list.append(entry)
                elif unit['contentType'] == PDF:
                    file_path = file_path + ".pdf"
                    entry = (PDF, file_path, unit['id'], unit['contentId'])
                    resource_list.append(entry)
                elif unit['contentType'] == RICH_TEXT:
                    if not unit.get('jsonContent'):
                        continue
                    # NOTE(review): eval() runs on text taken from the course
                    # info response; worth confirming the payload is trusted.
                    json_content = eval(unit['jsonContent'])
                    # The attachment is stored under its own file name.
                    file_path = build_path(
                        section, chapter, lesson, json_content["fileName"])
                    resource_list.append((RICH_TEXT, file_path, json_content))
    return resource_list
def parse(url, config):
    """Prepare everything needed to download the videos behind ``url``.

    Merges ``config`` into ``CONFIG``, installs the cookies on the spider,
    creates the output directories, sets up the optional playlist, collects
    the videos, keeps only the requested episodes, resolves their segments
    and publishes the result through ``exports``.

    Args:
        url: Address handed to ``get_title`` and ``get_videos``.
        config: Mapping merged into ``CONFIG``; its ``"cookies"`` entry is
            passed to ``spider.set_cookies``.

    Returns:
        None.  ``exports`` receives the ``"videos"`` and ``"video_dir"`` keys.
    """
    CONFIG.update(config)
    spider.set_cookies(config["cookies"])

    # Title
    title = get_title(url)
    print(title)

    # Directory layout: <dir>/<title> - bilibili/Videos
    root_dir = os.path.join(CONFIG['dir'], title + " - bilibili")
    CONFIG["base_dir"] = touch_dir(repair_filename(root_dir))
    CONFIG["video_dir"] = touch_dir(
        os.path.join(CONFIG['base_dir'], "Videos"))

    # Optional playlist file
    playlist_type = CONFIG["playlist_type"]
    if playlist_type == "dpl":
        playlist_file = os.path.join(CONFIG['base_dir'], 'Playlist.dpl')
        CONFIG['playlist'] = Dpl(
            playlist_file, path_type=CONFIG["playlist_path_type"])
    elif playlist_type == "m3u":
        playlist_file = os.path.join(CONFIG['base_dir'], 'Playlist.m3u')
        CONFIG['playlist'] = M3u(
            playlist_file, path_type=CONFIG["playlist_path_type"])
    else:
        CONFIG['playlist'] = None

    # Collect every video; the playlist is flushed once all paths are written
    videos = get_videos(url)
    CONFIG["videos"] = videos
    if CONFIG['playlist'] is not None:
        CONFIG['playlist'].flush()

    # Keep only the episodes that were asked for
    episodes = parse_episodes(CONFIG["episodes"], len(videos))
    videos = [video for video in videos if video.id in episodes]
    CONFIG["videos"] = videos

    # Resolve segment information and URLs of each remaining video
    for i, video in enumerate(videos):
        progress = "{:02}/{:02} parsing segments info...".format(
            i, len(videos))
        print(progress, end="\r")
        parse_segment_info(video)

    # Publish what the downloader needs
    exports.update({
        "videos": videos,
        "video_dir": CONFIG["video_dir"],
    })
def get_resource(term_id, token, file_types=(VIDEO, PDF, RICH_TEXT)):
    """Collect the downloadable courseware of a term.

    Walks chapters -> lessons -> units of the structure returned by
    ``get_courseinfo`` and, for every unit whose ``contentType`` is listed in
    ``file_types``, builds its target path from
    ``CONFIG['file_path_template']`` and makes sure the parent directory
    exists.  Video paths are additionally written to the playlist.

    Args:
        term_id: Term identifier passed through to ``get_courseinfo``.
        token: Token passed through to ``get_courseinfo``.
        file_types: Container of content types to include.  The default is an
            immutable tuple so the default value cannot be shared and mutated
            between calls.

    Returns:
        A list of tuples: ``(VIDEO, path, unit_id, content_id)``,
        ``(PDF, path, unit_id, content_id)`` or
        ``(RICH_TEXT, path, json_content)``.  Rich-text units without
        ``jsonContent`` produce no entry.
    """
    resource_list = []
    template = CONFIG['file_path_template']
    course_info = get_courseinfo(term_id, token)
    chapters = course_info.get('results').get('termDto').get('chapters')

    for chapter_num, chapter in enumerate(chapters):
        for lesson_num, lesson in enumerate(chapter.get('lessons')):
            for unit_num, unit in enumerate(lesson.get('units')):
                unit_type = unit['contentType']
                if unit_type not in file_types:
                    continue

                courseware_num = (
                    chapter_num + 1, lesson_num + 1, unit_num + 1)
                # Fields common to the regular path and to the attachment
                # path used by rich-text units.
                common = {
                    'base_dir': base_dir,
                    'sep': os.path.sep,
                    'cnt_1': get_section_num(courseware_num, level=1),
                    'cnt_2': get_section_num(courseware_num, level=2),
                    'cnt_3': get_section_num(courseware_num, level=3),
                    'chapter_name': repair_filename(chapter["name"]),
                    'lesson_name': repair_filename(lesson["name"]),
                }
                file_path = template.format(
                    type=COURSEWARE.get(unit_type, 'Unknown'),
                    unit_name=repair_filename(unit["name"]),
                    **common
                )
                touch_dir(os.path.dirname(file_path))

                if unit_type == VIDEO:
                    file_path += '.mp4'
                    playlist.write_path(file_path)
                    resource_list.append(
                        (VIDEO, file_path, unit['id'], unit['contentId']))
                elif unit_type == PDF:
                    file_path += ".pdf"
                    resource_list.append(
                        (PDF, file_path, unit['id'], unit['contentId']))
                elif unit_type == RICH_TEXT and unit.get('jsonContent'):
                    # SECURITY NOTE(review): ``jsonContent`` comes from the
                    # remote response and is executed by ``eval``.  Consider
                    # ``json.loads`` / ``ast.literal_eval`` once the payload
                    # format is confirmed; kept as-is to preserve behaviour.
                    json_content = eval(unit['jsonContent'])
                    # Sanitise the stem only, so the extension survives.
                    stem, ext = os.path.splitext(json_content["fileName"])
                    file_path = template.format(
                        type='File',
                        unit_name=repair_filename(stem) + ext,
                        **common
                    )
                    touch_dir(os.path.dirname(file_path))
                    resource_list.append(
                        (RICH_TEXT, file_path, json_content))
    return resource_list