def main(course_url, config):
    session = model.login(site="xuetangx", conf=config)
    course_id_search = re.search(
        r"courses/(?P<id>.+)/(courseware|info|discussion|wiki|progress|about)",
        course_url)
    if course_id_search:
        course_id = course_id_search.group("id")
        main_page = "http://www.xuetangx.com/courses/{course_id}".format(course_id=course_id)
        info = model.out_info(url=main_page, download_path=config.Download_Path)
        main_path = model.generate_path([config.Download_Path, info.folder])

        # Download cache lists
        info_list = []
        video_list = []
        srt_list = []
        doc_list = []

        # img_link and video_link extracted from info
        if info.img_link:
            img_file_name = r"课程封面图-{title}.jpg".format(title=info.title)
            img_file_path = model.generate_path([main_path, img_file_name])
            print("课程封面图: {link}".format(link=info.img_link))
            info_list.append((info.img_link, img_file_path))
        if info.video_link:
            video_file_name = r"课程简介-{title}.mp4".format(title=info.title)
            video_file_path = model.generate_path([main_path, video_file_name])
            print("课程简介视频: {link}".format(link=info.video_link))
            info_list.append((info.video_link, video_file_path))

        # Get enrollment info and check whether the course has been joined
        page_courseware = session.get(url="{0}/courseware".format(main_page))
        if page_courseware.url.find("about") == -1 and page_courseware.url.find("login") == -1:
            # The courseware page was reached successfully.
            # Judged here by the redirected URL:
            # 1. Logged in but not enrolled in this course -> redirected to ../about
            # 2. Not logged in (or wrong password) -> redirected to
            #    http://www.xuetangx.com/accounts/login?next=..
            print("Generating download information.")
            # Parse the courseware page
            courseware_bs = BeautifulSoup(page_courseware.text, "lxml")
            chapter = courseware_bs.find_all("div", class_="chapter")
            for week in chapter:
                week_name = week.h3.a.string.strip()
                for lesson in week.ul.find_all("a"):
                    # Collect lesson info
                    lesson_name = model.clean_filename(lesson.p.string)  # main title
                    lesson_page = session.get(
                        url="http://www.xuetangx.com{href}".format(href=lesson['href'])).text
                    lesson_bs = BeautifulSoup(lesson_page, "lxml")
                    tab_list = {}
                    for tab in lesson_bs.find_all("a", role="tab"):
                        tab_list[tab.get('id')] = re.search("(.+)", tab.get('title')).group(1)
                    seq_contents = lesson_bs.find_all('div', class_="seq_contents")

                    seq_video_content_len = 0
                    for seq in seq_contents:
                        if re.search(r"data-type=[\'\"]Video[\'\"]", seq.text):
                            seq_video_content_len += 1

                    for i, seq in enumerate(seq_contents):
                        seq_name = lesson_name
                        seq_path = model.generate_path([main_path, week_name])
                        srt_path = model.generate_path([main_path, "srt", week_name])
                        doc_path = model.generate_path([main_path, "docs", week_name])
                        if seq_video_content_len > 1:
                            # With a single video there is no need for a subfolder
                            seq_name_raw = model.clean_filename(tab_list[seq.get("aria-labelledby")])
                            seq_name = r"{0} {1}".format(i, seq_name_raw)
                            seq_path = model.generate_path([seq_path, lesson_name])
                            srt_path = model.generate_path([srt_path, lesson_name])
                            doc_path = model.generate_path([doc_path, lesson_name])

                        if re.search(r"data-type=[\'\"]Video[\'\"]", seq.text):  # Video
                            lesson_ccsource = re.search(
                                r"data-ccsource=[\'\"](.+)[\'\"]", seq.text).group(1)
                            r = session.get(
                                url="http://www.xuetangx.com/videoid2source/{0}".format(lesson_ccsource))
                            resp_json = json.loads(r.text)
                            if len(resp_json['sources']['quality20']) != 0:  # AttributeError
                                video_link = resp_json['sources']['quality20'][0]
                                video_file_name = "{0}.mp4".format(seq_name)
                            elif len(resp_json['sources']['quality10']) != 0:
                                video_link = resp_json['sources']['quality10'][0]
                                video_file_name = "{0}_sd.mp4".format(seq_name)
                            video_file_path = model.generate_path([seq_path, video_file_name])
                            print("视频: \"{name}\" \"{link}\"".format(name=video_file_name, link=video_link))
                            video_list.append((video_link, video_file_path))

                        seq_bs = BeautifulSoup(seq.text, "lxml")
                        if config.Download_Srt and seq_bs.find("a", text="下载字幕"):  # Subtitle
                            raw_link = seq_bs.find("a", text="下载字幕")["href"]
                            srt_link = "http://www.xuetangx.com{0}".format(raw_link)
                            srt_file_name = "{0}.srt".format(seq_name)
                            srt_file_path = model.generate_path([srt_path, srt_file_name])
                            print("字幕: \"{name}\" \"{link}\"".format(name=srt_file_name, link=srt_link))
                            srt_list.append((srt_link, srt_file_path))
                        if config.Download_Docs and seq_bs.find("a", text="下载讲义"):  # Lecture notes
                            raw_link = seq_bs.find("a", text="下载讲义")["href"]
                            doc_link = "http://www.xuetangx.com{0}".format(raw_link)
                            doc_file_name = model.clean_filename(doc_link.split("/")[-1])
                            doc_file_path = model.generate_path([doc_path, doc_file_name])
                            print("文档: \"{name}\" \"{link}\"".format(name=doc_file_name, link=doc_link))
                            doc_list.append((doc_link, doc_file_path))
        else:
            # Login failed or the course has not been joined
            print("Something went wrong. You may not have joined this course, "
                  "or the password is wrong.")
            return

        # Handle the lecture notes on the info page
        page_info = session.get(url="{0}/info".format(main_page))
        info_bs = BeautifulSoup(page_info.text, "lxml")
        doc_menu = info_bs.find("section", attrs={"aria-label": re.compile("讲义导航")})
        for each in doc_menu.find_all("a"):
            doc_name = each["href"].split("/")[-1]
            doc_link = "http://www.xuetangx.com{0}".format(each["href"])
            doc_file_path = model.generate_path([main_path, "docs", doc_name])
            print("文档: \"{name}\" \"{link}\"".format(name=doc_name, link=doc_link))
            doc_list.append((doc_link, doc_file_path))

        # Download
        if config.Download:
            if config.Download_Method == "Aria2":
                # Hand the downloads over to aria2
                model.aira2_download(info_list + video_list + doc_list)
                # Items that need the session (or expire) use the built-in downloader
                model.download_queue(session, srt_list, queue_length=config.Download_Queue_Length)
            else:
                # Fall back to the built-in downloader
                model.download_queue(session, info_list + video_list + srt_list + doc_list,
                                     queue_length=config.Download_Queue_Length)
    else:
        print("No course Id, please check!")
        return
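# Usage sketch for the xuetangx entry point above (the course URL is a hypothetical
# placeholder), assuming the same config loader used by the other variants in this
# section:
#
#     config = model.config("settings.conf", "xuetangx")
#     main("http://www.xuetangx.com/courses/<course_id>/about", config)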
def main(course_url):
    config = model.config("settings.conf", "icourse163")
    session = model.login(site="icourse163", conf=config)
    http_session_id = session.cookies["NTESSTUDYSI"]
    c_tid = re.search(
        r"(?:(learn)|(course))/(?P<id>(?P<c_id>[\w:+-]+)(\?tid=(?P<t_id>\d+))?)#?/?",
        course_url)

    # Download cache lists
    main_list = []
    srt_list = []
    doc_list = []

    # Handle the course_url to get the right courseId and termId
    if c_tid:
        if c_tid.group("t_id"):
            # Use the tid when the user provides one
            term_id = c_tid.group("t_id")
            info_url = "http://www.icourse163.org/course/{id}#/info".format(id=c_tid.group('id'))
        else:
            # Otherwise fetch the latest tid from the info page
            term_id = None
            print("No termId given. Will choose the latest term.")
            # Use the course's default address
            info_url = "http://www.icourse163.org/course/{id}#/info".format(id=c_tid.group('c_id'))

        page_about = session.get(url=info_url)
        if page_about.url == page_about.request.url:
            # The course exists. A non-existent course is 302-redirected to
            # http://www.icourse163.org/, so compare the response URL with the request URL.
            page_about_bs = BeautifulSoup(page_about.text, "lxml")
            course_info_raw = page_about_bs.find("script", text=re.compile(r"termDto")).string.replace("\n", "")
            if term_id is None:
                # No tid given: look up the latest term automatically
                term_id = re.search(r"termId : \"(\d+)\"", course_info_raw).group(1)

            # Collect course info
            course_page_title = re.search(r'(.+?)_(.+?)_(.+?)', page_about_bs.title.string)
            course_title = model.clean_filename(course_page_title.group(1))
            school = course_page_title.group(2)
            teacher = model.sort_teacher(page_about_bs.find_all('h3', class_="f-fc3"))
            folder = model.clean_filename('-'.join([course_title, school, teacher]))
            print("The Download INFO:\n"  # Output download course info
                  "link:{url}\nCourse:{folder}\nid:{id}\n".format(url=info_url, folder=folder, id=term_id))
            main_path = model.generate_path([config.Download_Path, folder])

            info_img_link = page_about_bs.find("div", id="j-courseImg").img["src"]
            img_file_name = r"课程封面图-{title}.png".format(title=course_title)
            img_file_path = model.generate_path([main_path, img_file_name])
            print("课程封面图: {link}".format(link=info_img_link))
            main_list.append((info_img_link, img_file_path))

            # Intro video
            video_search = re.search(r"videoId : \"(\d+)\"", course_info_raw)
            if video_search:
                payload = {
                    'callCount': 1,
                    'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
                    'httpSessionId': http_session_id,
                    'c0-scriptName': 'CourseBean',
                    'c0-methodName': 'getLessonUnitPreviewVo',
                    'c0-id': 0,
                    'c0-param0': video_search.group(1),
                    'c0-param1': 1,
                    'batchId': random.randint(1000000000000, 20000000000000)
                }
                ask_video_url = "http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLessonUnitPreviewVo.dwr"
                resp = session.post(url=ask_video_url, data=payload).text
                for k in ['mp4ShdUrl', 'mp4HdUrl', 'mp4SdUrl']:  # , 'flvShdUrl', 'flvHdUrl', 'flvSdUrl'
                    video_search_group = re.search(r's\d+.(?P<VideoType>' + str(k) + ')="(?P<dllink>.+?)";', resp)
                    if video_search_group:
                        info_video_link = video_search_group.group("dllink")
                        video_file_name = r"课程简介-{title}.mp4".format(title=course_title)
                        video_file_path = model.generate_path([main_path, video_file_name])
                        print("课程简介视频: {link}".format(link=info_video_link))
                        main_list.append((info_video_link, video_file_path))
                        break
        else:
            print("Course not found on \"icourse163.org\", please check.")
            return

        # Get the course's chapters
        payload = {
            'callCount': 1,
            'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
            'httpSessionId': http_session_id,
            'c0-scriptName': 'CourseBean',
            'c0-methodName': 'getLastLearnedMocTermDto',
            'c0-id': 0,
            'c0-param0': term_id,
            'batchId': random.randint(1000000000000, 20000000000000)
        }
        cs_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLastLearnedMocTermDto.dwr'
        rdata = session.post(cs_url, data=payload, timeout=None).text

        if re.search(r"var s\d+={}", rdata):
            print("Generating download information.")
            # Data-cleaning regexes
            week_reg = re.compile(r"s\d+.contentId=null;"
                                  r".+s\d+.lessons=(?P<lessons>s\d+)"
                                  r".+s\d+.name=\"(?P<week_name>.+?)\"")
            chapter_reg = re.compile(r"s\d+.chapterId=\d+;"
                                     r".+s\d+.name=\"(?P<chapter_name>.+?)\"")
            lesson_reg = re.compile(r"s\d+.anchorQuestions=(null|s\d+);"
                                    r".+s\d+.contentId=(?P<contentId>\d+)"
                                    r".+s\d+.contentType=(?P<contentType>\d+)"
                                    r".+s\d+.id=(?P<id>\d+)"
                                    r".+s\d+.name=\"(?P<lesson_name>.+?)\"")
            # Counters
            week_list = []
            chapter_list = []
            video_in_chapter_list = []

            for line in rdata.splitlines():
                if re.match(week_reg, line):  # Week
                    week_re = re.search(week_reg, line)
                    week_name = model.clean_filename(model.raw_unicode_escape(week_re.group("week_name")))
                    week_list.append(week_name)
                if re.match(chapter_reg, line):  # Chapter
                    chapter_re = re.search(chapter_reg, line)
                    chapter_name = model.clean_filename(model.raw_unicode_escape(chapter_re.group("chapter_name")))
                    chapter_list.append(chapter_name)
                    print("\n", week_list[-1], chapter_list[-1])
                    video_in_chapter_list.append(0)
                if re.match(lesson_reg, line):
                    lesson_re = re.search(lesson_reg, line)
                    lesson_loc_pattern = model.generate_path([week_list[-1], chapter_list[-1]])
                    lesson_name = model.clean_filename(model.raw_unicode_escape(lesson_re.group("lesson_name")))
                    lesson_content_type = int(lesson_re.group("contentType"))

                    # Prepare data and post
                    payload = {
                        'callCount': 1,
                        'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
                        'httpSessionId': http_session_id,
                        'c0-scriptName': 'CourseBean',
                        'c0-methodName': 'getLessonUnitLearnVo',
                        'c0-id': 1,
                        'c0-param0': lesson_re.group("contentId"),
                        'c0-param1': lesson_content_type,
                        'c0-param2': 0,
                        'c0-param3': lesson_re.group("id"),
                        'batchId': random.randint(1000000000000, 20000000000000)
                    }
                    cs_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLessonUnitLearnVo.dwr'
                    rdata = session.post(cs_url, data=payload, timeout=None).text

                    # Sort data depending on its contentType:
                    # 1 -> Video, 2 -> Test, 3 -> Docs, 4 -> Rich text, 5 -> Examination, 6 -> Discussion
                    if lesson_content_type == 1:  # Video
                        count = video_in_chapter_list[-1]
                        count_lesson_name = model.clean_filename("{0} {lesson}".format(count, lesson=lesson_name))
                        for k in ['mp4ShdUrl', 'mp4HdUrl', 'mp4SdUrl']:  # , 'flvShdUrl', 'flvHdUrl', 'flvSdUrl'
                            if re.search(r's\d+.{0}=".+?";'.format(k), rdata):
                                k_type = re.search("mp4(.+)Url", k).group(1)
                                video_file_name = "{0}.mp4".format(count_lesson_name)
                                if k_type != "Shd":
                                    video_file_name = "{0}_{type}.mp4".format(count_lesson_name, type=k_type)
                                video_link = re.search(r's\d+.' + str(k) + r'="(.+?\.mp4).+?";', rdata).group(1)
                                video_file_path = model.generate_path([main_path, lesson_loc_pattern, video_file_name])
                                main_list.append((video_link, video_file_path))
                                print("视频: \"{name}\" \"{link}\"".format(name=video_file_name, link=video_link))
                                break
                        # Subtitle
                        if config.Download_Srt:
                            srt_path = model.generate_path([main_path, "Srt", lesson_loc_pattern])
                            if re.search(r's\d+.name="\\u4E2D\\u6587";s\d+.url="(.+?)"', rdata):  # Chinese
                                srt_chs_re = re.search(r's\d+.name="\\u4E2D\\u6587";s\d+.url="(?P<url>.+?)"', rdata)
                                srt_file_name = "{0}.chs.srt".format(count_lesson_name)
                                srt_file_path = model.generate_path([srt_path, srt_file_name])
                                srt_chs_link = srt_chs_re.group("url")
                                print("字幕Chs: \"{name}\" \"{link}\"".format(name=srt_file_name, link=srt_chs_link))
                                srt_list.append((srt_chs_link, srt_file_path))
                            if re.search(r's\d+.name="\\u82F1\\u6587";s\d+.url="(.+?)"', rdata):  # English
                                srt_eng_re = re.search(r's\d+.name="\\u82F1\\u6587";s\d+.url="(?P<url>.+?)"', rdata)
                                srt_file_name = "{0}.eng.srt".format(count_lesson_name)
                                srt_file_path = model.generate_path([srt_path, srt_file_name])
                                srt_eng_link = srt_eng_re.group("url")
                                print("字幕Eng: \"{name}\" \"{link}\"".format(name=srt_file_name, link=srt_eng_link))
                                srt_list.append((srt_eng_link, srt_file_path))
                        video_in_chapter_list[-1] += 1

                    if lesson_content_type == 3 and config.Download_Docs:  # Documentation
                        doc_link = str(re.search(r'textOrigUrl:"(.+?)"', rdata).group(1))
                        doc_name = "{0}.pdf".format(lesson_name)
                        doc_path = model.generate_path([main_path, "Docs", lesson_loc_pattern])
                        doc_file_path = model.generate_path([doc_path, doc_name])
                        doc_list.append((doc_link, doc_file_path))
                        print("文档: \"{name}\" \"{link}\"".format(name=doc_name, link=doc_link))

            if config.Download:
                if config.Download_Method == "Aria2":
                    # Hand the downloads over to aria2
                    model.aira2_download(main_list)
                    # Items that need the session (or expire) still use the built-in downloader
                    model.download_queue(session, srt_list + doc_list, queue_length=config.Download_Queue_Length)
                else:
                    # Fall back to the built-in downloader
                    model.download_queue(session, main_list + srt_list + doc_list,
                                         queue_length=config.Download_Queue_Length)
        else:
            err_message = re.search(r'message:(.+)\}\)', rdata).group(1)
            print("Error: {0}. Please make sure you log in with your 163 email "
                  "and that your \"Session-Cookies\" pair is correct.".format(err_message))
    else:
        print("No course Id, please check!")
        return
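# `model.raw_unicode_escape` (also used bare as `raw_unicode_escape` below) is not
# defined in this section. A minimal sketch, assuming all it needs to do is turn the
# literal "\uXXXX" escapes found in DWR responses into real characters:
def raw_unicode_escape(text):
    # e.g. "\\u4E2D\\u6587" -> "中文"
    return text.encode("utf-8").decode("unicode_escape")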
def main(course_url, config):
    # Handle the course_url to get the right courseId and termId
    if not re.search(r'([A-Za-z]*-\d*)', course_url):
        print("No course Id, please check!")
        return
    else:
        session = model.login(site="icourse163", conf=config)
        httpSessionId = session.cookies["NTESSTUDYSI"]
        info = model.out_info(url=course_url, download_path=config.Download_Path)
        main_path = model.generate_path([config.Download_Path, info.folder])

        # Get the course's chapters
        payload = {
            'callCount': 1,
            'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
            'httpSessionId': httpSessionId,
            'c0-scriptName': 'CourseBean',
            'c0-methodName': 'getLastLearnedMocTermDto',
            'c0-id': 0,
            'c0-param0': info.id,
            'batchId': random.randint(1000000000000, 20000000000000)
        }
        cs_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLastLearnedMocTermDto.dwr'
        rdata = session.post(cs_url, data=payload, timeout=None).text

        if re.search(r'var s\d+=\{\}', rdata):
            print("Generating download information.")
            # Download cache lists
            info_list = []
            video_list = []
            srt_list = []
            doc_list = []

            # img_link and video_link extracted from info
            if info.img_link:
                img_file_name = r"课程封面图-{title}.png".format(title=info.title)
                img_file_path = model.generate_path([main_path, img_file_name])
                print("课程封面图: {link}".format(link=info.img_link))
                info_list.append((info.img_link, img_file_path))
            if info.video_link:
                video_file_name = r"课程简介-{title}.mp4".format(title=info.title)
                video_file_path = model.generate_path([main_path, video_file_name])
                print("课程简介视频: {link}".format(link=info.video_link))
                info_list.append((info.video_link, video_file_path))

            # Data-cleaning regexes
            week_reg = re.compile(r"s\d+.contentId=null;"
                                  r".+s\d+.lessons=(?P<lessons>s\d+)"
                                  r".+s\d+.name=\"(?P<week_name>.+?)\"")
            chapter_reg = re.compile(r"s\d+.chapterId=\d+;"
                                     r".+s\d+.name=\"(?P<chapter_name>.+?)\"")
            lesson_reg = re.compile(r"s\d+.anchorQuestions=(null|s\d+);"
                                    r".+s\d+.contentId=(?P<contentId>\d+)"
                                    r".+s\d+.contentType=(?P<contentType>\d+)"
                                    r".+s\d+.id=(?P<id>\d+)"
                                    r".+s\d+.name=\"(?P<lesson_name>.+?)\"")
            # Counters
            week_list = []
            chapter_list = []
            video_in_chapter_list = []

            for line in rdata.splitlines():
                if re.match(week_reg, line):  # Week
                    week_re = re.search(week_reg, line)
                    week_name = model.clean_filename(raw_unicode_escape(week_re.group("week_name")))
                    week_list.append(week_name)
                if re.match(chapter_reg, line):  # Chapter
                    chapter_re = re.search(chapter_reg, line)
                    chapter_name = model.clean_filename(raw_unicode_escape(chapter_re.group("chapter_name")))
                    chapter_list.append(chapter_name)
                    print("\n", week_list[-1], chapter_list[-1])
                    video_in_chapter_list.append(0)
                if re.match(lesson_reg, line):
                    lesson_re = re.search(lesson_reg, line)
                    lesson_loc_pattern = model.generate_path([week_list[-1], chapter_list[-1]])
                    lesson_name = model.clean_filename(raw_unicode_escape(lesson_re.group("lesson_name")))
                    lesson_content_type = int(lesson_re.group("contentType"))

                    # Prepare data and post
                    payload = {
                        'callCount': 1,
                        'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
                        'httpSessionId': httpSessionId,
                        'c0-scriptName': 'CourseBean',
                        'c0-methodName': 'getLessonUnitLearnVo',
                        'c0-id': 1,
                        'c0-param0': lesson_re.group("contentId"),
                        'c0-param1': lesson_content_type,
                        'c0-param2': 0,
                        'c0-param3': lesson_re.group("id"),
                        'batchId': random.randint(1000000000000, 20000000000000)
                    }
                    cs_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLessonUnitLearnVo.dwr'
                    rdata = session.post(cs_url, data=payload, timeout=None).text

                    # Sort data depending on its contentType:
                    # 1 -> Video, 2 -> Test, 3 -> Docs, 4 -> Rich text, 5 -> Examination, 6 -> Discussion
                    if lesson_content_type == 1:  # Video
                        count = video_in_chapter_list[-1]
                        count_lesson_name = "{0} {lesson}".format(count, lesson=lesson_name)
                        for k in ['mp4ShdUrl', 'mp4HdUrl', 'mp4SdUrl']:  # , 'flvShdUrl', 'flvHdUrl', 'flvSdUrl'
                            if re.search(r's\d+.{0}=".+?";'.format(k), rdata):
                                k_type = re.search("mp4(.+)Url", k).group(1)
                                video_file_name = "{0}.mp4".format(count_lesson_name)
                                if k_type != "Shd":
                                    video_file_name = "{0}_{type}.mp4".format(count_lesson_name, type=k_type)
                                video_link = re.search(r's\d+.' + str(k) + r'="(.+?\.mp4).+?";', rdata).group(1)
                                video_file_path = model.generate_path([main_path, lesson_loc_pattern, video_file_name])
                                video_list.append((video_link, video_file_path))
                                print("视频: \"{name}\" \"{link}\"".format(name=video_file_name, link=video_link))
                                break
                        # Subtitle
                        if config.Download_Srt:
                            srt_path = model.generate_path([main_path, "Srt", lesson_loc_pattern])
                            if re.search(r's\d+.name="\\u4E2D\\u6587";s\d+.url="(.+?)"', rdata):  # Chinese
                                srt_chs_re = re.search(r's\d+.name="\\u4E2D\\u6587";s\d+.url="(?P<url>.+?)"', rdata)
                                srt_file_name = "{0}.chs.srt".format(count_lesson_name)
                                srt_file_path = model.generate_path([srt_path, srt_file_name])
                                srt_chs_link = srt_chs_re.group("url")
                                print("字幕Chs: \"{name}\" \"{link}\"".format(name=srt_file_name, link=srt_chs_link))
                                srt_list.append((srt_chs_link, srt_file_path))
                            if re.search(r's\d+.name="\\u82F1\\u6587";s\d+.url="(.+?)"', rdata):  # English
                                srt_eng_re = re.search(r's\d+.name="\\u82F1\\u6587";s\d+.url="(?P<url>.+?)"', rdata)
                                srt_file_name = "{0}.eng.srt".format(count_lesson_name)
                                srt_file_path = model.generate_path([srt_path, srt_file_name])
                                srt_eng_link = srt_eng_re.group("url")
                                print("字幕Eng: \"{name}\" \"{link}\"".format(name=srt_file_name, link=srt_eng_link))
                                srt_list.append((srt_eng_link, srt_file_path))
                        video_in_chapter_list[-1] += 1

                    if lesson_content_type == 3 and config.Download_Docs:  # Documentation
                        doc_link = str(re.search(r'textOrigUrl:"(.+?)"', rdata).group(1))
                        doc_name = "{0}.pdf".format(lesson_name)
                        doc_path = model.generate_path([main_path, "Docs", lesson_loc_pattern])
                        doc_file_path = model.generate_path([doc_path, doc_name])
                        doc_list.append((doc_link, doc_file_path))
                        print("文档: \"{name}\" \"{link}\"".format(name=doc_name, link=doc_link))

            if config.Download:
                if config.Download_Method == "Aria2":
                    # Hand the downloads over to aria2
                    model.aira2_download(info_list + video_list)
                    # Items that need the session (or expire) still use the built-in downloader
                    model.download_queue(session, srt_list + doc_list, queue_length=config.Download_Queue_Length)
                else:
                    # Fall back to the built-in downloader
                    model.download_queue(session, info_list + video_list + srt_list + doc_list,
                                         queue_length=config.Download_Queue_Length)
        else:
            err_message = re.search(r'message:(.+)\}\)', rdata).group(1)
            print("Error: {0}. Please make sure you log in with your 163 email "
                  "and that your \"Session-Cookies\" pair is correct.".format(err_message))
def main(course_url):
    config = model.config("settings.conf", "xuetangx")  # Load config
    session = model.login(site="xuetangx", conf=config)
    course_id_search = re.search(
        r"courses/(?P<id>.+)/(courseware|info|discussion|wiki|progress|about)",
        course_url)

    # Download cache lists
    main_list = []
    srt_list = []
    doc_list = []

    if course_id_search:
        course_id = course_id_search.group("id")
        main_page = "http://www.xuetangx.com/courses/{course_id}".format(course_id=course_id)
        page_about_url = "{course_host}/about".format(course_host=main_page)
        page_about = session.get(url=page_about_url)
        if page_about.text.find("页面无法找到") == -1:  # the course page exists
            page_about_bs = BeautifulSoup(page_about.text, "lxml")
            # Load course info
            course_detail_bs = page_about_bs.find("section", class_="courseabout_detail")
            course_name_tag = course_detail_bs.find("h3", class_="courseabout_title")
            course_title = model.clean_filename(course_name_tag.get_text())
            school = course_name_tag.find_next("a").get_text()
            teacher = model.sort_teacher(
                page_about_bs.find("ul", class_="teacher_info").find_all("span", class_="name"))
            folder = model.clean_filename('-'.join([course_title, school, teacher]))
            print("The Download INFO:\n"  # Output download course info
                  "link:{url}\nCourse:{folder}\nid:{id}\n".format(url=page_about_url, folder=folder, id=course_id))
            main_path = model.generate_path([config.Download_Path, folder])

            video_box = course_detail_bs.find('div', class_='video_box')
            try:
                info_img_link = model.link_check("http://www.xuetangx.com", video_box['data-poster'])
                info_video_link = get_video(session, video_box["data-ccid"])
                if info_video_link:
                    video_file_name = r"课程简介-{title}.mp4".format(title=course_title)
                    video_file_path = model.generate_path([main_path, video_file_name])
                    print("课程简介视频: {link}".format(link=info_video_link))
                    main_list.append((info_video_link, video_file_path))
            except KeyError:
                info_img_link = model.link_check("http://www.xuetangx.com", video_box.img["src"])
            if info_img_link:
                img_file_name = r"课程封面图-{title}.jpg".format(title=course_title)
                img_file_path = model.generate_path([main_path, img_file_name])
                print("课程封面图: {link}".format(link=info_img_link))
                main_list.append((info_img_link, img_file_path))
        else:
            print("Course not found on \"xuetangx.com\", please check.")
            return

        # Get enrollment info and check whether the course has been joined
        page_courseware = session.get(url="{0}/courseware".format(main_page))
        if page_courseware.url.find("about") == -1 and page_courseware.url.find("login") == -1:
            # The courseware page was reached successfully.
            # Judged here by the redirected URL:
            # 1. Logged in but not enrolled in this course -> redirected to ../about
            # 2. Not logged in (or wrong password) -> redirected to
            #    http://www.xuetangx.com/accounts/login?next=..
            print("Generating download information.")
            # Parse the courseware page
            courseware_bs = BeautifulSoup(page_courseware.text, "lxml")
            chapter = courseware_bs.find_all("div", class_="chapter")
            for week in chapter:
                week_name = model.clean_filename(week.h3.a.string.strip())
                for lesson in week.ul.find_all("a"):
                    # Collect lesson info
                    lesson_name = model.clean_filename(lesson.p.string)  # main title
                    lesson_page = session.get(
                        url="http://www.xuetangx.com{href}".format(href=lesson['href']),
                        timeout=None)
                    lesson_bs = BeautifulSoup(lesson_page.text, "lxml")
                    tab_list = {}
                    for tab in lesson_bs.find_all("a", role="tab"):
                        tab_list[tab.get('id')] = re.search("(.+)", tab.get('title')).group(1)
                    seq_contents = lesson_bs.find_all('div', class_="seq_contents")
                    print("\n", week_name, lesson_name)

                    seq_video_content_len = 0
                    for seq in seq_contents:
                        if re.search(r"data-type=[\'\"]Video[\'\"]", seq.text):
                            seq_video_content_len += 1

                    for i, seq in enumerate(seq_contents):
                        seq_name = lesson_name
                        seq_path = model.generate_path([main_path, week_name])
                        srt_path = model.generate_path([main_path, "srt", week_name])
                        doc_path = model.generate_path([main_path, "docs", week_name])
                        if seq_video_content_len > 1:
                            # With a single video there is no need for a subfolder
                            seq_name_raw = model.clean_filename(tab_list[seq.get("aria-labelledby")])
                            seq_name = r"{0} {1}".format(i, seq_name_raw)
                            seq_path = model.generate_path([seq_path, lesson_name])
                            srt_path = model.generate_path([srt_path, lesson_name])
                            doc_path = model.generate_path([doc_path, lesson_name])

                        if re.search(r"data-type=[\'\"]Video[\'\"]", seq.text):  # Video
                            lesson_ccsource = re.search(r"data-ccsource=[\'\"](.+)[\'\"]", seq.text).group(1)
                            video_link = get_video(session, lesson_ccsource)
                            video_file_name = "{0}.mp4".format(seq_name)
                            # Assumption: an SD-only source is recognised by "quality10" in the link
                            if video_link.find("quality10") != -1:
                                video_file_name = "{0}_sd.mp4".format(seq_name)
                            video_file_path = model.generate_path([seq_path, video_file_name])
                            print("视频: \"{name}\" \"{link}\"".format(name=video_file_name, link=video_link))
                            main_list.append((video_link, video_file_path))

                        seq_bs = BeautifulSoup(seq.text, "lxml")
                        if config.Download_Srt and seq_bs.find("a", text="下载字幕"):  # Subtitle
                            raw_link = seq_bs.find("a", text="下载字幕")["href"]
                            srt_link = model.link_check("http://www.xuetangx.com", raw_link)
                            srt_file_name = "{0}.srt".format(seq_name)
                            srt_file_path = model.generate_path([srt_path, srt_file_name])
                            print("字幕: \"{name}\" \"{link}\"".format(name=srt_file_name, link=srt_link))
                            srt_list.append((srt_link, srt_file_path))
                        if config.Download_Docs and seq_bs.find("a", text="下载讲义"):  # Lecture notes
                            raw_link = seq_bs.find("a", text="下载讲义")["href"]
                            doc_link = model.link_check("http://www.xuetangx.com", raw_link)
                            doc_file_name = model.clean_filename(doc_link.split("/")[-1])
                            doc_file_path = model.generate_path([doc_path, doc_file_name])
                            print("文档: \"{name}\" \"{link}\"".format(name=doc_file_name, link=doc_link))
                            doc_list.append((doc_link, doc_file_path))
        else:
            # Login failed or the course has not been joined
            print("Something went wrong. You may not have joined this course, "
                  "or the password is wrong.")
            return

        # Handle the lecture notes on the info page
        page_info = session.get(url="{0}/info".format(main_page))
        info_bs = BeautifulSoup(page_info.text, "lxml")
        doc_menu = info_bs.find("section", attrs={"aria-label": re.compile("讲义导航")})
        for each in doc_menu.find_all("a"):
            doc_name = each["href"].split("/")[-1]
            doc_link = model.link_check("http://www.xuetangx.com", each["href"])
            doc_file_path = model.generate_path([main_path, "docs", doc_name])
            print("文档: \"{name}\" \"{link}\"".format(name=doc_name, link=doc_link))
            doc_list.append((doc_link, doc_file_path))

        # Download
        if config.Download:
            if config.Download_Method == "Aria2":
                # Hand the downloads over to aria2
                model.aira2_download(main_list + doc_list)
                # Items that need the session (or expire) use the built-in downloader
                model.download_queue(session, srt_list, queue_length=config.Download_Queue_Length)
            else:
                # Fall back to the built-in downloader
                model.download_queue(session, main_list + srt_list + doc_list,
                                     queue_length=config.Download_Queue_Length)
    else:
        print("No course Id, please check!")
        return
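# `get_video` is called above but not defined in this section. A minimal sketch,
# assuming it wraps the videoid2source endpoint used directly by the older xuetangx
# main() above: prefer the HD ("quality20") source, fall back to SD ("quality10"),
# and return an empty string when no source is available.
def get_video(session, ccid):
    resp = session.get(url="http://www.xuetangx.com/videoid2source/{0}".format(ccid))
    sources = json.loads(resp.text).get("sources", {})
    if sources.get("quality20"):
        return sources["quality20"][0]
    if sources.get("quality10"):
        return sources["quality10"][0]
    return ""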