Example #1
def main(course_url, config):
    session = model.login(site="xuetangx", conf=config)
    course_id_search = re.search(
        r"courses/(?P<id>.+)/(courseware|info|discussion|wiki|progress|about)",
        course_url)
    if course_id_search:
        course_id = course_id_search.group("id")
        main_page = "http://www.xuetangx.com/courses/{course_id}".format(
            course_id=course_id)
        info = model.out_info(url=main_page,
                              download_path=config.Download_Path)
        main_path = model.generate_path([config.Download_Path, info.folder])

        # Download cache lists
        info_list = []
        video_list = []
        srt_list = []
        doc_list = []

        # img_link and video_link extracted from info
        if info.img_link:
            img_file_name = r"课程封面图-{title}.jpg".format(title=info.title)
            img_file_path = model.generate_path([main_path, img_file_name])
            print("课程封面图: {link}".format(link=info.img_link))
            info_list.append((info.img_link, img_file_path))
        if info.video_link:
            video_file_name = r"课程简介-{title}.mp4".format(title=info.title)
            video_file_path = model.generate_path([main_path, video_file_name])
            print("课程简介视频: {link}".format(link=info.video_link))
            info_list.append((info.video_link, video_file_path))

        # Fetch the courseware page and check whether the course has been joined
        page_courseware = session.get(url="{0}/courseware".format(main_page))
        if page_courseware.url.find(
                "about") == -1 and page_courseware.url.find(
                    "login") == -1:  # chapter list fetched successfully
            # Decide based on the redirected URL:
            # 1. Logged in but not enrolled: redirected to the ../about page
            # 2. Not logged in (or wrong password): redirected to http://www.xuetangx.com/accounts/login?next=..
            print("Generating download information.")

            # Parse the courseware page
            courseware_bs = BeautifulSoup(page_courseware.text, "lxml")
            chapter = courseware_bs.find_all("div", class_="chapter")

            for week in chapter:
                week_name = week.h3.a.string.strip()
                for lesson in week.ul.find_all("a"):
                    # Collect lesson info
                    lesson_name = model.clean_filename(lesson.p.string)  # main title
                    lesson_page = session.get(
                        url="http://www.xuetangx.com{href}".format(
                            href=lesson['href'])).text
                    lesson_bs = BeautifulSoup(lesson_page, "lxml")

                    tab_list = {}
                    for tab in lesson_bs.find_all("a", role="tab"):
                        tab_list[tab.get('id')] = re.search(
                            "(.+)", tab.get('title')).group(1)

                    seq_contents = lesson_bs.find_all('div',
                                                      class_="seq_contents")

                    seq_video_content_len = 0
                    for seq in seq_contents:
                        if re.search(r"data-type=[\'\"]Video[\'\"]", seq.text):
                            seq_video_content_len += 1

                    for i, seq in enumerate(seq_contents):
                        seq_name = lesson_name
                        seq_path = model.generate_path([main_path, week_name])
                        srt_path = model.generate_path(
                            [main_path, "srt", week_name])
                        doc_path = model.generate_path(
                            [main_path, "docs", week_name])
                        if seq_video_content_len > 1:  # with only one video, no subfolder is needed
                            seq_name_raw = model.clean_filename(
                                tab_list[seq.get("aria-labelledby")])
                            seq_name = r"{0} {1}".format(i, seq_name_raw)
                            seq_path = model.generate_path(
                                [seq_path, lesson_name])
                            srt_path = model.generate_path(
                                [srt_path, lesson_name])
                            doc_path = model.generate_path(
                                [doc_path, lesson_name])

                        if re.search(r"data-type=[\'\"]Video[\'\"]",
                                     seq.text):  # video
                            lesson_ccsource = re.search(
                                r"data-ccsource=[\'\"](.+)[\'\"]",
                                seq.text).group(1)
                            r = session.get(
                                url="http://www.xuetangx.com/videoid2source/{0}"
                                .format(lesson_ccsource))
                            resp_json = json.loads(r.text)

                            if len(resp_json['sources']
                                   ['quality20']) != 0:  # AttributeError
                                video_link = resp_json['sources']['quality20'][
                                    0]
                                video_file_name = "{0}.mp4".format(seq_name)
                            elif len(resp_json['sources']['quality10']) != 0:
                                video_link = resp_json['sources']['quality10'][
                                    0]
                                video_file_name = "{0}_sd.mp4".format(seq_name)
                            video_file_path = model.generate_path(
                                [seq_path, video_file_name])
                            print("视频: \"{name}\" \"{link}\"".format(
                                name=video_file_name, link=video_link))
                            video_list.append((video_link, video_file_path))

                            seq_bs = BeautifulSoup(seq.text, "lxml")
                            if config.Download_Srt and seq_bs.find(
                                    "a", text="下载字幕"):  # subtitles
                                raw_link = seq_bs.find("a",
                                                       text="下载字幕")["href"]
                                srt_link = "http://www.xuetangx.com{0}".format(
                                    raw_link)
                                srt_file_name = "{0}.srt".format(seq_name)
                                srt_file_path = model.generate_path(
                                    [srt_path, srt_file_name])
                                print("字幕: \"{name}\" \"{link}\"".format(
                                    name=srt_file_name, link=srt_link))
                                srt_list.append((srt_link, srt_file_path))
                            if config.Download_Docs and seq_bs.find(
                                    "a", text="下载讲义"):  # handouts
                                raw_link = seq_bs.find("a",
                                                       text="下载讲义")["href"]
                                doc_link = "http://www.xuetangx.com{0}".format(
                                    raw_link)
                                doc_file_name = model.clean_filename(
                                    doc_link.split("/")[-1])
                                doc_file_path = model.generate_path(
                                    [doc_path, doc_file_name])
                                print("文档: \"{name}\" \"{link}\"".format(
                                    name=doc_file_name, link=doc_link))
                                doc_list.append((doc_link, doc_file_path))

        else:  # not logged in or not enrolled in this course
            print(
                "Something went wrong. You may not have joined this course, "
                "or you entered the wrong password."
            )
            return

        # Handle the course handouts listed on the info page
        page_info = session.get(url="{0}/info".format(main_page))
        info_bs = BeautifulSoup(page_info.text, "lxml")
        doc_menu = info_bs.find("section",
                                attrs={"aria-label": re.compile("讲义导航")})
        for each in doc_menu.find_all("a"):
            doc_name = each["href"].split("/")[-1]
            doc_link = "http://www.xuetangx.com{0}".format(each["href"])
            doc_file_path = model.generate_path([main_path, "docs", doc_name])
            print("文档: \"{name}\" \"{link}\"".format(name=doc_name,
                                                     link=doc_link))
            doc_list.append((doc_link, doc_file_path))

        # Download
        if config.Download:
            if config.Download_Method == "Aria2":  # hand the downloads to aria2
                model.aira2_download(info_list + video_list + doc_list)
                model.download_queue(session,
                                     srt_list,
                                     queue_length=config.Download_Queue_Length
                                     )  # items that need the session or expire
            else:  # fall back to the built-in downloader
                model.download_queue(session,
                                     info_list + video_list + srt_list +
                                     doc_list,
                                     queue_length=config.Download_Queue_Length)

    else:
        print("No course ID found, please check the URL!")

    return
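
Every example on this page leans on model.clean_filename and model.generate_path to turn course, chapter and lesson names into download paths. Their implementations are not shown here; a minimal sketch of hypothetical equivalents, assuming they only strip characters that are illegal in file names and join path segments, could look like this:

import os
import re


def clean_filename(name, replacement="_"):
    # Replace characters that are not valid in Windows/Unix file names.
    # (Hypothetical stand-in for model.clean_filename.)
    return re.sub(r'[\\/:*?"<>|\r\n]+', replacement, str(name)).strip()


def generate_path(parts):
    # Join the given path segments; creating the directories is assumed to be
    # handled later by the downloader itself.
    # (Hypothetical stand-in for model.generate_path.)
    return os.path.join(*[str(p) for p in parts])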
Example #2
def main(course_url):
    config = model.config("settings.conf", "icourse163")
    session = model.login(site="icourse163", conf=config)
    http_session_id = session.cookies["NTESSTUDYSI"]
    c_tid = re.search(r"(?:(learn)|(course))/(?P<id>(?P<c_id>[\w:+-]+)(\?tid=(?P<t_id>\d+))?)#?/?", course_url)

    # Download cache list
    main_list = []
    srt_list = []
    doc_list = []

    # Parse course_url to get the right courseId and termId
    if c_tid:
        if c_tid.group("t_id"):  # if the user supplied a tid, use it
            term_id = c_tid.group("t_id")
            info_url = "http://www.icourse163.org/course/{id}#/info".format(id=c_tid.group('id'))
        else:  # otherwise fetch the latest tid from the info page
            term_id = None
            print("No termId given; the latest term will be used.")
            info_url = "http://www.icourse163.org/course/{id}#/info".format(id=c_tid.group('c_id'))  # use the course's default address
        page_about = session.get(url=info_url)
        if page_about.url == page_about.request.url:  # the course exists
            # A non-existent course is 302-redirected to http://www.icourse163.org/,
            # so compare the response URL with the request URL.
            page_about_bs = BeautifulSoup(page_about.text, "lxml")
            course_info_raw = page_about_bs.find("script", text=re.compile(r"termDto")).string.replace("\n", "")
            if term_id is None:  # no tid supplied, look up the latest term automatically
                term_id = re.search(r"termId : \"(\d+)\"", course_info_raw).group(1)
            # Extract course info
            course_page_title = re.search(r'(.+?)_(.+?)_(.+?)', page_about_bs.title.string)
            course_title = model.clean_filename(course_page_title.group(1))
            school = course_page_title.group(2)
            teacher = model.sort_teacher(page_about_bs.find_all('h3', class_="f-fc3"))
            folder = model.clean_filename('-'.join([course_title, school, teacher]))

            print("The Download INFO:\n"  # Output download course info
                  "link:{url}\nCourse:{folder}\nid:{id}\n".format(url=info_url, folder=folder, id=term_id))

            main_path = model.generate_path([config.Download_Path, folder])

            info_img_link = page_about_bs.find("div", id="j-courseImg").img["src"]
            img_file_name = r"课程封面图-{title}.png".format(title=course_title)
            img_file_path = model.generate_path([main_path, img_file_name])
            print("课程封面图: {link}".format(link=info_img_link))
            main_list.append((info_img_link, img_file_path))

            # intro_video
            video_search = re.search(r"videoId : \"(\d+)\"", course_info_raw)
            if video_search:
                payload = {
                    'callCount': 1,
                    'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
                    'httpSessionId': http_session_id,
                    'c0-scriptName': 'CourseBean',
                    'c0-methodName': 'getLessonUnitPreviewVo',
                    'c0-id': 0,
                    'c0-param0': video_search.group(1),
                    'c0-param1': 1,
                    'batchId': random.randint(1000000000000, 20000000000000)
                }
                ask_video_url = "http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLessonUnitPreviewVo.dwr"
                resp = session.post(url=ask_video_url, data=payload).text
                for k in ['mp4ShdUrl', 'mp4HdUrl', 'mp4SdUrl']:  # , 'flvShdUrl', 'flvHdUrl', 'flvSdUrl'
                    video_search_group = re.search(r's\d+.(?P<VideoType>' + str(k) + ')="(?P<dllink>.+?)";', resp)
                    if video_search_group:
                        info_video_link = video_search_group.group("dllink")
                        video_file_name = r"课程简介-{title}.mp4".format(title=course_title)
                        video_file_path = model.generate_path([main_path, video_file_name])
                        print("课程简介视频: {link}".format(link=info_video_link))
                        main_list.append((info_video_link, video_file_path))
                        break
        else:
            print("Course not found on \"icourse163.org\", please check the URL.")
            return

        # Get course's chapter
        payload = {
            'callCount': 1,
            'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
            'httpSessionId': http_session_id,
            'c0-scriptName': 'CourseBean',
            'c0-methodName': 'getLastLearnedMocTermDto',
            'c0-id': 0,
            'c0-param0': term_id,
            'batchId': random.randint(1000000000000, 20000000000000)
        }
        cs_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLastLearnedMocTermDto.dwr'
        rdata = session.post(cs_url, data=payload, timeout=None).text

        if re.search(r"var s\d+={}", rdata):
            print("Generating download information.")

            # Data cleaning Reg
            week_reg = re.compile(r"s\d+.contentId=null;"
                                  r".+s\d+.lessons=(?P<lessons>s\d+)"
                                  r".+s\d+.name=\"(?P<week_name>.+?)\"")
            chapter_reg = re.compile(r"s\d+.chapterId=\d+;"
                                     r".+s\d+.name=\"(?P<chapter_name>.+?)\"")
            lesson_reg = re.compile(r"s\d+.anchorQuestions=(null|s\d+);"
                                    r".+s\d+.contentId=(?P<contentId>\d+)"
                                    r".+s\d+.contentType=(?P<contentType>\d+)"
                                    r".+s\d+.id=(?P<id>\d+)"
                                    r".+s\d+.name=\"(?P<lesson_name>.+?)\"")

            # count_list
            week_list = []
            chapter_list = []
            video_in_chapter_list = []

            for line in rdata.splitlines():
                if re.match(week_reg, line):  # Week
                    week_re = re.search(week_reg, line)
                    week_name = model.clean_filename(model.raw_unicode_escape(week_re.group("week_name")))
                    week_list.append(week_name)
                if re.match(chapter_reg, line):  # Chapter
                    chapter_re = re.search(chapter_reg, line)
                    chapter_name = model.clean_filename(model.raw_unicode_escape(chapter_re.group("chapter_name")))
                    chapter_list.append(chapter_name)
                    print("\n", week_list[-1], chapter_list[-1])
                    video_in_chapter_list.append(0)
                if re.match(lesson_reg, line):
                    lesson_re = re.search(lesson_reg, line)
                    lesson_loc_pattern = model.generate_path([week_list[-1], chapter_list[-1]])

                    lesson_name = model.clean_filename(model.raw_unicode_escape(lesson_re.group("lesson_name")))
                    lesson_content_type = int(lesson_re.group("contentType"))

                    # prepare data and post
                    payload = {
                        'callCount': 1,
                        'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
                        'httpSessionId': http_session_id,
                        'c0-scriptName': 'CourseBean',
                        'c0-methodName': 'getLessonUnitLearnVo',
                        'c0-id': 1,
                        'c0-param0': lesson_re.group("contentId"),
                        'c0-param1': lesson_content_type,
                        'c0-param2': 0,
                        'c0-param3': lesson_re.group("id"),
                        'batchId': random.randint(1000000000000, 20000000000000)
                    }
                    cs_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLessonUnitLearnVo.dwr'

                    rdata = session.post(cs_url, data=payload, timeout=None).text
                    # Dispatch on the lesson's contentType:
                    # 1 -> Video, 2 -> Test, 3 -> Docs, 4 -> Rich text, 5 -> Examination, 6 -> Discussion
                    if lesson_content_type == 1:  # Video
                        count = video_in_chapter_list[-1]
                        count_lesson_name = model.clean_filename("{0} {lesson}".format(count, lesson=lesson_name))
                        for k in ['mp4ShdUrl', 'mp4HdUrl', 'mp4SdUrl']:  # , 'flvShdUrl', 'flvHdUrl', 'flvSdUrl'
                            if re.search(r's\d+.{0}=".+?";'.format(k), rdata):
                                k_type = re.search("mp4(.+)Url", k).group(1)
                                video_file_name = "{0}.mp4".format(count_lesson_name)
                                if k_type != "Shd":
                                    video_file_name = "{0}_{type}.mp4".format(count_lesson_name, type=k_type)
                                video_link = re.search(r's\d+.' + str(k) + r'="(.+?\.mp4).+?";', rdata).group(1)
                                video_file_path = model.generate_path([main_path, lesson_loc_pattern, video_file_name])
                                main_list.append((video_link, video_file_path))
                                print("视频: \"{name}\" \"{link}\"".format(name=video_file_name, link=video_link))
                                break
                        # Subtitle
                        if config.Download_Srt:
                            srt_path = model.generate_path([main_path, "Srt", lesson_loc_pattern])
                            if re.search(r's\d+.name="\\u4E2D\\u6587";s\d+.url="(.+?)"', rdata):  # Chinese
                                srt_chs_re = re.search(r's\d+.name="\\u4E2D\\u6587";s\d+.url="(?P<url>.+?)"', rdata)
                                srt_file_name = "{0}.chs.srt".format(count_lesson_name)
                                srt_file_path = model.generate_path([srt_path, srt_file_name])
                                srt_chs_link = srt_chs_re.group("url")
                                print("字幕Chs: \"{name}\" \"{link}\"".format(name=srt_file_name, link=srt_chs_link))
                                srt_list.append((srt_chs_link, srt_file_path))
                            if re.search(r's\d+.name="\\u82F1\\u6587";s\d+.url="(.+?)"', rdata):  # English
                                srt_eng_re = re.search(r's\d+.name="\\u82F1\\u6587";s\d+.url="(?P<url>.+?)"', rdata)
                                srt_file_name = "{0}.eng.srt".format(count_lesson_name)
                                srt_file_path = model.generate_path([srt_path, srt_file_name])
                                srt_eng_link = srt_eng_re.group("url")
                                print("字幕Eng: \"{name}\" \"{link}\"".format(name=srt_file_name, link=srt_eng_link))
                                srt_list.append((srt_eng_link, srt_file_path))
                        video_in_chapter_list[-1] += 1

                    if lesson_content_type == 3 and config.Download_Docs:  # Documentation
                        doc_link = str(re.search(r'textOrigUrl:"(.+?)"', rdata).group(1))
                        doc_name = "{0}.pdf".format(lesson_name)
                        doc_path = model.generate_path([main_path, "Docs", lesson_loc_pattern])
                        doc_file_path = model.generate_path([doc_path, doc_name])
                        doc_list.append((doc_link, doc_file_path))
                        print("文档: \"{name}\" \"{link}\"".format(name=doc_name, link=doc_link))

            if config.Download:
                if config.Download_Method == "Aria2":  # hand the downloads to aria2
                    model.aira2_download(main_list)
                    # Subtitles and docs need the session or expire, so keep using the built-in downloader.
                    model.download_queue(session, srt_list + doc_list, queue_length=config.Download_Queue_Length)
                else:  # fall back to the built-in downloader
                    model.download_queue(session, main_list + srt_list + doc_list,
                                         queue_length=config.Download_Queue_Length)
        else:
            err_message = re.search(r'message:(.+)\}\)', rdata).group(1)
            print("Error: {0}. Please make sure you logged in with a 163 email "
                  "and that your \"Session-Cookies\" pair is correct.".format(err_message))
    else:
        print("No course ID found, please check the URL!")
        return
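
Both icourse163 examples (#2 and #3) hand-build the same DWR "plaincall" form payload for every CourseBean method they invoke. A hypothetical helper such as dwr_call below (not part of the project; the field names, the '${scriptSessionId}' trick and the parameter order are copied from the examples above) sketches how that duplication could be factored out:

import random

DWR_BASE = "http://www.icourse163.org/dwr/call/plaincall/CourseBean.{method}.dwr"


def dwr_call(session, http_session_id, method, params):
    # Build the DWR "plaincall" form payload the way the examples above do:
    # positional arguments become c0-param0, c0-param1, ...
    payload = {
        'callCount': 1,
        'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
        'httpSessionId': http_session_id,
        'c0-scriptName': 'CourseBean',
        'c0-methodName': method,
        'c0-id': 0,
        'batchId': random.randint(1000000000000, 20000000000000)
    }
    for i, value in enumerate(params):
        payload['c0-param{0}'.format(i)] = value
    return session.post(DWR_BASE.format(method=method), data=payload, timeout=None).text


# Example call, assuming session, http_session_id and term_id are already available:
# rdata = dwr_call(session, http_session_id, "getLastLearnedMocTermDto", [term_id])

The DWR response encodes Chinese names as literal \uXXXX escape sequences, which is why every captured name is passed through raw_unicode_escape before clean_filename. A hedged one-line stand-in (the project's own helper may be implemented differently) is:

def raw_unicode_escape(text):
    # Decode literal "\uXXXX" sequences from the DWR response into real characters,
    # e.g. r"\u7B2C\u4E00\u5468" -> "第一周".
    # (Hypothetical stand-in for the raw_unicode_escape helper used above.)
    return text.encode("latin-1", "backslashreplace").decode("unicode_escape")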
Example #3
def main(course_url, config):
    # Parse course_url to get the right courseId and termId
    if not re.search(r'([A-Za-z]*-\d*)', course_url):
        print("No course ID found, please check the URL!")
        return
    else:
        session = model.login(site="icourse163", conf=config)
        httpSessionId = session.cookies["NTESSTUDYSI"]

        info = model.out_info(url=course_url, download_path=config.Download_Path)
        main_path = model.generate_path([config.Download_Path, info.folder])

        # Get course's chapter
        payload = {
            'callCount': 1,
            'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
            'httpSessionId': httpSessionId,
            'c0-scriptName': 'CourseBean',
            'c0-methodName': 'getLastLearnedMocTermDto',
            'c0-id': 0,
            'c0-param0': info.id,
            'batchId': random.randint(1000000000000, 20000000000000)
        }
        cs_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLastLearnedMocTermDto.dwr'
        rdata = session.post(cs_url, data=payload, timeout=None).text

        if re.search(r'var s\d+=\{\}', rdata):
            print("Generating download information.")
            # Download cache lists
            info_list = []
            video_list = []
            srt_list = []
            doc_list = []

            # img_link and video_link extracted from info
            if info.img_link:
                img_file_name = r"课程封面图-{title}.png".format(title=info.title)
                img_file_path = model.generate_path([main_path, img_file_name])
                print("课程封面图: {link}".format(link=info.img_link))
                info_list.append((info.img_link, img_file_path))
            if info.video_link:
                video_file_name = r"课程简介-{title}.mp4".format(title=info.title)
                video_file_path = model.generate_path([main_path, video_file_name])
                print("课程简介视频: {link}".format(link=info.video_link))
                info_list.append((info.video_link, video_file_path))

            # Data cleaning Reg
            week_reg = re.compile(r"s\d+.contentId=null;"
                                  r".+s\d+.lessons=(?P<lessons>s\d+)"
                                  r".+s\d+.name=\"(?P<week_name>.+?)\"")
            chapter_reg = re.compile(r"s\d+.chapterId=\d+;"
                                     r".+s\d+.name=\"(?P<chapter_name>.+?)\"")
            lesson_reg = re.compile(r"s\d+.anchorQuestions=(null|s\d+);"
                                    r".+s\d+.contentId=(?P<contentId>\d+)"
                                    r".+s\d+.contentType=(?P<contentType>\d+)"
                                    r".+s\d+.id=(?P<id>\d+)"
                                    r".+s\d+.name=\"(?P<lesson_name>.+?)\"")

            # count_list
            week_list = []
            chapter_list = []
            video_in_chapter_list = []

            for line in rdata.splitlines():
                if re.match(week_reg, line):  # Week
                    week_re = re.search(week_reg, line)
                    week_name = model.clean_filename(raw_unicode_escape(week_re.group("week_name")))
                    week_list.append(week_name)
                if re.match(chapter_reg, line):  # Chapter
                    chapter_re = re.search(chapter_reg, line)
                    chapter_name = model.clean_filename(raw_unicode_escape(chapter_re.group("chapter_name")))
                    chapter_list.append(chapter_name)
                    print("\n", week_list[-1], chapter_list[-1])
                    video_in_chapter_list.append(0)
                if re.match(lesson_reg, line):
                    lesson_re = re.search(lesson_reg, line)
                    lesson_loc_pattern = model.generate_path([week_list[-1], chapter_list[-1]])

                    lesson_name = model.clean_filename(raw_unicode_escape(lesson_re.group("lesson_name")))
                    lesson_content_type = int(lesson_re.group("contentType"))

                    # prepare data and post
                    payload = {
                        'callCount': 1,
                        'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
                        'httpSessionId': httpSessionId,
                        'c0-scriptName': 'CourseBean',
                        'c0-methodName': 'getLessonUnitLearnVo',
                        'c0-id': 1,
                        'c0-param0': lesson_re.group("contentId"),
                        'c0-param1': lesson_content_type,
                        'c0-param2': 0,
                        'c0-param3': lesson_re.group("id"),
                        'batchId': random.randint(1000000000000, 20000000000000)
                    }
                    cs_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLessonUnitLearnVo.dwr'

                    rdata = session.post(cs_url, data=payload, timeout=None).text
                    # Dispatch on the lesson's contentType:
                    # 1 -> Video, 2 -> Test, 3 -> Docs, 4 -> Rich text, 5 -> Examination, 6 -> Discussion
                    if lesson_content_type == 1:  # Video
                        count = video_in_chapter_list[-1]
                        count_lesson_name = "{0} {lesson}".format(count, lesson=lesson_name)
                        for k in ['mp4ShdUrl', 'mp4HdUrl', 'mp4SdUrl']:  # , 'flvShdUrl', 'flvHdUrl', 'flvSdUrl'
                            if re.search(r's\d+.{0}=".+?";'.format(k), rdata):
                                k_type = re.search("mp4(.+)Url", k).group(1)
                                video_file_name = "{0}.mp4".format(count_lesson_name)
                                if k_type != "Shd":
                                    video_file_name = "{0}_{type}.mp4".format(count_lesson_name, type=k_type)
                                video_link = re.search(r's\d+.' + str(k) + r'="(.+?\.mp4).+?";', rdata).group(1)
                                video_file_path = model.generate_path([main_path, lesson_loc_pattern, video_file_name])
                                video_list.append((video_link, video_file_path))
                                print("视频: \"{name}\" \"{link}\"".format(name=video_file_name, link=video_link))
                                break
                        # Subtitle
                        if config.Download_Srt:
                            srt_path = model.generate_path([main_path, "Srt", lesson_loc_pattern])
                            if re.search(r's\d+.name="\\u4E2D\\u6587";s\d+.url="(.+?)"', rdata):  # Chinese
                                srt_chs_re = re.search(r's\d+.name="\\u4E2D\\u6587";s\d+.url="(?P<url>.+?)"', rdata)
                                srt_file_name = "{0}.chs.srt".format(count_lesson_name)
                                srt_file_path = model.generate_path([srt_path, srt_file_name])
                                srt_chs_link = srt_chs_re.group("url")
                                print("字幕Chs: \"{name}\" \"{link}\"".format(name=srt_file_name, link=srt_chs_link))
                                srt_list.append((srt_chs_link, srt_file_path))
                            if re.search(r's\d+.name="\\u82F1\\u6587";s\d+.url="(.+?)"', rdata):  # English
                                srt_eng_re = re.search(r's\d+.name="\\u82F1\\u6587";s\d+.url="(?P<url>.+?)"', rdata)
                                srt_file_name = "{0}.eng.srt".format(count_lesson_name)
                                srt_file_path = model.generate_path([srt_path, srt_file_name])
                                srt_eng_link = srt_eng_re.group("url")
                                print("字幕Eng: \"{name}\" \"{link}\"".format(name=srt_file_name, link=srt_eng_link))
                                srt_list.append((srt_eng_link, srt_file_path))
                        video_in_chapter_list[-1] += 1

                    if lesson_content_type == 3 and config.Download_Docs:  # Documentation
                        doc_link = str(re.search(r'textOrigUrl:"(.+?)"', rdata).group(1))
                        doc_name = "{0}.pdf".format(lesson_name)
                        doc_path = model.generate_path([main_path, "Docs", lesson_loc_pattern])
                        doc_file_path = model.generate_path([doc_path, doc_name])
                        doc_list.append((doc_link, doc_file_path))
                        print("文档: \"{name}\" \"{link}\"".format(name=doc_name, link=doc_link))

            if config.Download:
                if config.Download_Method == "Aria2":  # hand the downloads to aria2
                    model.aira2_download(info_list + video_list)
                    # Subtitles and docs need the session or expire, so keep using the built-in downloader.
                    model.download_queue(session, srt_list + doc_list, queue_length=config.Download_Queue_Length)
                else:  # fall back to the built-in downloader
                    model.download_queue(session, info_list + video_list + srt_list + doc_list,
                                         queue_length=config.Download_Queue_Length)
        else:
            err_message = re.search(r'message:(.+)\}\)', rdata).group(1)
            print("Error: {0}. Please make sure you logged in with a 163 email "
                  "and that your \"Session-Cookies\" pair is correct.".format(err_message))
Example #4
def main(course_url):
    config = model.config("settings.conf", "xuetangx")  # Loading config
    session = model.login(site="xuetangx", conf=config)
    course_id_search = re.search(r"courses/(?P<id>.+)/(courseware|info|discussion|wiki|progress|about)", course_url)

    # Download cache list
    main_list = []
    srt_list = []
    doc_list = []

    if course_id_search:
        course_id = course_id_search.group("id")
        main_page = "http://www.xuetangx.com/courses/{course_id}".format(course_id=course_id)

        page_about_url = "{course_host}/about".format(course_host=main_page)
        page_about = session.get(url=page_about_url)
        if page_about.text.find("页面无法找到") == -1:  # the course page exists
            page_about_bs = BeautifulSoup(page_about.text, "lxml")
            # load course info
            course_detail_bs = page_about_bs.find("section", class_="courseabout_detail")
            course_name_tag = course_detail_bs.find("h3", class_="courseabout_title")

            course_title = model.clean_filename(course_name_tag.get_text())
            school = course_name_tag.find_next("a").get_text()
            teacher = model.sort_teacher(
                page_about_bs.find("ul", class_="teacher_info").find_all("span", class_="name"))
            folder = model.clean_filename('-'.join([course_title, school, teacher]))

            print("The Download INFO:\n"  # Output download course info
                  "link:{url}\nCourse:{folder}\nid:{id}\n".format(url=page_about_url, folder=folder, id=course_id))

            main_path = model.generate_path([config.Download_Path, folder])

            video_box = course_detail_bs.find('div', class_='video_box')
            try:
                info_img_link = model.link_check("http://www.xuetangx.com", video_box['data-poster'])
                info_video_link = get_video(session, video_box["data-ccid"])
                if info_video_link:
                    video_file_name = r"课程简介-{title}.mp4".format(title=course_title)
                    video_file_path = model.generate_path([main_path, video_file_name])
                    print("课程简介视频: {link}".format(link=info_video_link))
                    main_list.append((info_video_link, video_file_path))
            except KeyError:
                info_img_link = model.link_check("http://www.xuetangx.com", video_box.img["src"])

            if info_img_link:
                img_file_name = r"课程封面图-{title}.jpg".format(title=course_title)
                img_file_path = model.generate_path([main_path, img_file_name])
                print("课程封面图: {link}".format(link=info_img_link))
                main_list.append((info_img_link, img_file_path))
        else:
            print("Course not found on \"xuetangx.com\", please check the URL.")
            return

        # Fetch the courseware page and check whether the course has been joined
        page_courseware = session.get(url="{0}/courseware".format(main_page))
        if page_courseware.url.find("about") == -1 and page_courseware.url.find("login") == -1:  # chapter list fetched successfully
            # Decide based on the redirected URL:
            # 1. Logged in but not enrolled: redirected to the ../about page
            # 2. Not logged in (or wrong password): redirected to http://www.xuetangx.com/accounts/login?next=..
            print("Generating download information.")

            # Parse the courseware page
            courseware_bs = BeautifulSoup(page_courseware.text, "lxml")
            chapter = courseware_bs.find_all("div", class_="chapter")

            for week in chapter:
                week_name = model.clean_filename(week.h3.a.string.strip())
                for lesson in week.ul.find_all("a"):
                    # Collect lesson info
                    lesson_name = model.clean_filename(lesson.p.string)  # main title
                    lesson_page = session.get(url="http://www.xuetangx.com{href}".format(href=lesson['href']),
                                              timeout=None)
                    lesson_bs = BeautifulSoup(lesson_page.text, "lxml")

                    tab_list = {}
                    for tab in lesson_bs.find_all("a", role="tab"):
                        tab_list[tab.get('id')] = re.search("(.+)", tab.get('title')).group(1)

                    seq_contents = lesson_bs.find_all('div', class_="seq_contents")

                    print("\n", week_name, lesson_name)

                    seq_video_content_len = 0
                    for seq in seq_contents:
                        if re.search(r"data-type=[\'\"]Video[\'\"]", seq.text):
                            seq_video_content_len += 1

                    for i, seq in enumerate(seq_contents):
                        seq_name = lesson_name
                        seq_path = model.generate_path([main_path, week_name])
                        srt_path = model.generate_path([main_path, "srt", week_name])
                        doc_path = model.generate_path([main_path, "docs", week_name])
                        if seq_video_content_len > 1:  # with only one video, no subfolder is needed
                            seq_name_raw = model.clean_filename(tab_list[seq.get("aria-labelledby")])
                            seq_name = r"{0} {1}".format(i, seq_name_raw)
                            seq_path = model.generate_path([seq_path, lesson_name])
                            srt_path = model.generate_path([srt_path, lesson_name])
                            doc_path = model.generate_path([doc_path, lesson_name])

                        if re.search(r"data-type=[\'\"]Video[\'\"]", seq.text):  # video
                            lesson_ccsource = re.search(r"data-ccsource=[\'\"](.+)[\'\"]", seq.text).group(1)
                            video_link = get_video(session, lesson_ccsource)
                            video_file_name = "{0}.mp4".format(seq_name)
                            if video_link.find == -1:
                                video_file_name = "{0}_sd.mp4".format(seq_name)
                            video_file_path = model.generate_path([seq_path, video_file_name])
                            print("视频: \"{name}\" \"{link}\"".format(name=video_file_name, link=video_link))
                            main_list.append((video_link, video_file_path))

                            seq_bs = BeautifulSoup(seq.text, "lxml")
                            if config.Download_Srt and seq_bs.find("a", text="下载字幕"):  # subtitles
                                raw_link = seq_bs.find("a", text="下载字幕")["href"]
                                srt_link = model.link_check("http://www.xuetangx.com", raw_link)
                                srt_file_name = "{0}.srt".format(seq_name)
                                srt_file_path = model.generate_path([srt_path, srt_file_name])
                                print("字幕: \"{name}\" \"{link}\"".format(name=srt_file_name, link=srt_link))
                                srt_list.append((srt_link, srt_file_path))
                            if config.Download_Docs and seq_bs.find("a", text="下载讲义"):  # handouts
                                raw_link = seq_bs.find("a", text="下载讲义")["href"]
                                doc_link = model.link_check("http://www.xuetangx.com", raw_link)
                                doc_file_name = model.clean_filename(doc_link.split("/")[-1])
                                doc_file_path = model.generate_path([doc_path, doc_file_name])
                                print("文档: \"{name}\" \"{link}\"".format(name=doc_file_name, link=doc_link))
                                doc_list.append((doc_link, doc_file_path))

        else:  # not logged in or not enrolled in this course
            print("Something went wrong. You may not have joined this course, or you entered the wrong password.")
            return

        # Handle the course handouts listed on the info page
        page_info = session.get(url="{0}/info".format(main_page))
        info_bs = BeautifulSoup(page_info.text, "lxml")
        doc_menu = info_bs.find("section", attrs={"aria-label": re.compile("讲义导航")})
        for each in doc_menu.find_all("a"):
            doc_name = each["href"].split("/")[-1]
            doc_link = model.link_check("http://www.xuetangx.com", each["href"])
            doc_file_path = model.generate_path([main_path, "docs", doc_name])
            print("文档: \"{name}\" \"{link}\"".format(name=doc_name, link=doc_link))
            doc_list.append((doc_link, doc_file_path))

        # Download
        if config.Download:
            if config.Download_Method == "Aria2":  # hand the downloads to aria2
                model.aira2_download(main_list + doc_list)
                model.download_queue(session, srt_list, queue_length=config.Download_Queue_Length)  # items that need the session or expire
            else:  # fall back to the built-in downloader
                model.download_queue(session, main_list + srt_list + doc_list, queue_length=config.Download_Queue_Length)

    else:
        print("No course ID found, please check the URL!")

    return
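
Example #4 delegates video resolution to a get_video helper that is not defined on this page. Judging from the inline logic in Example #1 (the /videoid2source/ endpoint with its quality20/quality10 lists), a sketch of what it is assumed to do could be:

import json


def get_video(session, ccsource):
    # Resolve a xuetangx ccsource id to a direct mp4 URL, preferring the
    # higher-quality stream, mirroring the inline logic in Example #1.
    # (Sketch of the get_video helper assumed by Example #4; the real one may differ.)
    resp = session.get(url="http://www.xuetangx.com/videoid2source/{0}".format(ccsource))
    sources = json.loads(resp.text).get("sources", {})
    for quality in ("quality20", "quality10"):
        links = sources.get(quality)
        if links:
            return links[0]
    return None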