Beispiel #1
0
    def download_m3u8(self):
        """Download the m3u8 data of every course listed by get_jzw_info().

        Walks the stage -> week -> course hierarchy returned by
        ``self.get_jzw_info()`` and, for each course whose URL contains
        ``"course/"``, delegates to a ``JzwCourseM3u8Downloader`` rooted at
        ``<self.path>/<jzw_title>/<state_title>/<week_title>``.
        """
        jzw_info = self.get_jzw_info()
        # Each title is passed through filename_reg_check exactly once, at the
        # level where it is defined, instead of once per course iteration.
        jzw_title = tools.filename_reg_check(jzw_info["jzw_title"])
        for stage_info in jzw_info["stage_infos"]:
            state_title = tools.filename_reg_check(stage_info["state_title"])
            for week_info in stage_info["week_infos"]:
                week_title = tools.filename_reg_check(week_info["week_title"])
                course_infos = week_info["course_infos"]
                # The target directory only depends on the week, so build the
                # path once for all courses of that week.
                course_path = tools.join_path(self.path, jzw_title,
                                              state_title, week_title)
                for course_info in course_infos:
                    course_name = tools.filename_reg_check(
                        course_info['course_name'])
                    course_url = course_info['href']
                    index = course_info['index']

                    # Kept inside the loop so that a week without any course
                    # entries does not get a directory created for it.
                    tools.check_or_make_dir(course_path)
                    # Some entries are exams whose URL looks like /exam/123;
                    # only real courses are downloaded.
                    if "course/" in course_url:
                        downloader = JzwCourseM3u8Downloader(
                            self.session, course_url, course_path,
                            str(index) + ". " + course_name)
                        downloader.download_m3u8()
Beispiel #2
0
    def download_m3u8(self):
        """Download every lesson of the course described by get_course_info().

        The course info is always re-fetched (``forceDownload=True``). For
        each lesson a directory ``<self.root_path>/<chapter>/<lesson>`` is
        prepared and a ``JzwLessonM3u8Downloader`` is created; lessons whose
        ``type_icon`` contains ``"-video"`` have their m3u8 downloaded, all
        other lessons only have their media info downloaded.
        """
        course_info = self.get_course_info(forceDownload=True)
        for chapter in course_info['chapters']:
            chapter_name = tools.filename_reg_check(chapter['chapter_title'])
            lessons = chapter["lessons"]
            if not lessons:
                # Nothing to download, so do not create an empty directory.
                continue

            chapter_path = tools.join_path(self.root_path, chapter_name)
            tools.check_or_make_dir(self.root_path)
            # The chapter directory itself must exist before lesson
            # directories are created inside it; previously only root_path
            # was checked here even though chapter_path had been computed.
            tools.check_or_make_dir(chapter_path)

            for lesson in lessons:
                lesson_name = tools.filename_reg_check(lesson['lesson_name'])
                media_id = lesson['media_id']
                lesson_type = lesson['type_icon']

                lesson_path = tools.join_path(chapter_path, lesson_name)
                tools.check_or_make_dir(lesson_path)
                print(lesson_path)
                downloader = JzwLessonM3u8Downloader(self.session,
                                                     self.course_id, media_id,
                                                     lesson_name, chapter_path)
                if "-video" in lesson_type:
                    downloader.download_m3u8()
                else:
                    downloader.download_media_info()
Beispiel #3
0
 def __init__(self, session, jzw_url, path, jzw_name=""):
     """Store the session, page URL and base path of the downloader.

     Args:
         session: object stored as ``self.session`` (not used here).
         jzw_url: URL stored as ``self.jzw_url``.
         path: base directory under which the root directory is placed.
         jzw_name: optional name; when given it is passed through
             ``tools.filename_reg_check`` and joined onto ``path`` to form
             ``self.root_path``.
     """
     self.session = session
     self.jzw_url = jzw_url
     self.path = path
     # Always define root_path (falsy when no name was supplied) so that
     # later ``if not self.root_path`` checks cannot raise AttributeError.
     self.root_path = ""
     if jzw_name:
         jzw_name = tools.filename_reg_check(jzw_name)
         self.root_path = tools.join_path(self.path, jzw_name)
Beispiel #4
0
    def get_jzw_info(self, forceDownload=False):
        """Return the stage/week/course structure parsed from the page.

        The page HTML is always fetched because its title determines
        ``self.root_path`` (when that is not set yet) and therefore the
        location of the JSON cache file. Unless ``forceDownload`` is true, a
        non-empty cache file is loaded and returned as-is; otherwise the
        HTML is parsed, the result is written to the cache file and returned.

        Returns:
            dict with ``jzw_title`` and ``stage_infos``; each stage holds
            ``state_title`` and ``week_infos``; each week holds
            ``week_title`` and ``course_infos``; each course holds
            ``course_name``, ``href``, ``course_id`` and a 1-based ``index``
            within its week.
        """
        page = etree.HTML(self.get_html())
        raw_title = page.xpath('//h1[@class="stage-title"]/a/text()')[0]
        jzw_title = tools.filename_reg_check(raw_title.strip())

        if not self.root_path:
            self.root_path = tools.join_path(self.path, jzw_title)
        tools.check_or_make_dir(self.root_path)
        self.jzw_info_json_path = tools.join_path(self.root_path,
                                                  self.JZW_INFO_JSON)

        cache_path = self.jzw_info_json_path
        if not forceDownload and os.path.exists(cache_path):
            with open(cache_path, "r", encoding="utf8") as f:
                cached = f.read()
            # An empty cache file is ignored and the page is parsed again.
            if cached:
                return json.loads(cached)

        stage_nodes = page.xpath(
            '//div[contains(@class,"stage-box js-stage-box")]')
        stage_infos = []
        for stage_node in stage_nodes:
            state_title = stage_node.xpath(
                'div[contains(@class,"stage-title")]/text()')[0].strip()
            week_nodes = stage_node.xpath(
                'div/div[contains(@class,"week-box")]')
            week_infos = []
            for week_node in week_nodes:
                week_title = week_node.xpath(
                    'div[contains(@class,"week-title")]/text()')[0].strip()
                course_nodes = week_node.xpath(
                    'div[contains(@class,"class-box")]/a[@class="class def"]')
                course_infos = []
                # index restarts at 1 for every week.
                for index, course_node in enumerate(course_nodes, start=1):
                    course_name = course_node.xpath(
                        'div/div[@class="class-name"]/text()')[0].strip()
                    # The href attribute is site-relative, e.g. /course/1330
                    href = "https://class.imooc.com" + \
                        course_node.attrib['href']
                    course_infos.append({
                        "course_name": course_name,
                        "href": href,
                        "course_id": int(href.rsplit("/", 1)[-1]),
                        "index": index,
                    })
                week_infos.append({
                    "week_title": week_title,
                    "course_infos": course_infos,
                })
            stage_infos.append({
                "state_title": state_title,
                "week_infos": week_infos,
            })

        jzw = {"jzw_title": jzw_title, "stage_infos": stage_infos}
        serialized = json.dumps(jzw, ensure_ascii=False, indent=2)
        with open(cache_path, "w", encoding="utf8") as f:
            f.write(serialized)
        return jzw
 def __init__(self, session, course_id, media_id, lesson_name, path):
     """Store the lesson identifiers and derive the lesson's directory.

     ``self.lesson_name`` keeps the name exactly as passed in, while
     ``self.lesson_path`` is built from the name after it has gone through
     ``tools.filename_reg_check``.
     """
     self.session = session
     self.course_id, self.media_id = course_id, media_id
     self.lesson_name = lesson_name
     # Absolute path where the video is saved.
     self.path = path
     safe_name = tools.filename_reg_check(lesson_name)
     # Location where the key and m3u8 files are saved.
     self.lesson_path = tools.join_path(self.path, safe_name)
Beispiel #6
0
    def get_course_info(self, forceDownload=False):
        """Fetch the course page and parse its chapter/lesson listing.

        A course has several chapters and each chapter has several lessons.
        The page HTML is always fetched because its title determines
        ``self.root_path`` (when that is not set yet) and therefore where
        the JSON cache file lives.

        Args:
            forceDownload: when false and a non-empty cache file exists, the
                cached JSON is returned instead of re-parsing the page.

        Returns:
            dict with the keys ``title``, ``introduction``, ``course_id``,
            ``chapters`` (each with ``chapter_title`` and ``lessons``) and
            ``aid_infos`` (the result of ``self.get_aid_infos(html)``).
        """
        html = self.get_course_html()
        page = etree.HTML(html)

        raw_title = page.xpath("/html/body//h1/a/text()")[0].strip()
        title = tools.filename_reg_check(raw_title)
        if not self.root_path:
            self.root_path = tools.join_path(self.path, title)
        tools.check_or_make_dir(self.root_path)
        self.course_info_json_path = tools.join_path(self.root_path,
                                                     self.COURSE_INFO_JSON)

        cache_path = self.course_info_json_path
        if not forceDownload and os.path.exists(cache_path):
            with open(cache_path, "r", encoding="utf8") as f:
                cached = f.read()
            # An empty cache file is ignored and the page is parsed again.
            if cached:
                return json.loads(cached)

        introduction = page.xpath('//p[@class="con"]/text()')[0].strip()

        def parse_lesson(anchor):
            # Build one lesson entry from an <a> node of the chapter list.
            name_parts = anchor.xpath(
                'span[not(contains(@class,"finished"))]/text()')
            lesson_name = ''.join(name_parts)
            lesson_name = lesson_name.replace(" ", "").replace("\n", "")
            lesson_name = lesson_name.strip()
            # The icon class tells the lesson kind (video, exercise,
            # article, ...).
            type_icon = anchor.xpath('i/@class')[0]
            # href looks like lesson/1330#mid=41093: 1330 is the course id
            # and the value after "=" is the media id (i.e. the lesson id).
            href = anchor.xpath('@href')[0]
            return {
                "lesson_name": lesson_name,
                "href": href,
                "media_id": int(href.split("=")[-1]),
                "type_icon": type_icon,
            }

        chapter_infos = []
        for chapter_node in page.xpath(
                '//div[contains(@class,"chapter-item")]'):
            chapter_title = chapter_node.xpath("h2/text()")[0].strip()
            lessons = [
                parse_lesson(anchor)
                for anchor in chapter_node.xpath('ul/li/a')
            ]
            chapter_infos.append({
                "chapter_title": chapter_title,
                "lessons": lessons,
            })

        aid_infos = self.get_aid_infos(html)
        data = {
            "title": title,
            "introduction": introduction,
            "course_id": self.course_id,
            "chapters": chapter_infos,
            "aid_infos": aid_infos,
        }
        serialized = json.dumps(data, ensure_ascii=False, indent=2)
        with open(cache_path, "w", encoding="utf8") as f:
            f.write(serialized)
        return data