Ejemplo n.º 1
0
    def run(self, cfg: dict):
        """Download the audio (mp3) of one course, or only list its URLs.

        Reads ``course_id``, ``output_folder`` and ``url_only`` from *cfg*.
        Output goes to ``<output_folder>/mp3/<column_title>``.  When
        ``url_only`` is set, a single ``<title>.mp3.txt`` file listing the
        audio URL of every article is written and nothing is downloaded.

        A missing course id, an output folder that cannot be created, or a
        failure to build the data client is reported on stderr and the
        method returns without raising.

        Raises:
            Exception: if the course's ``column_type`` is not 1.
        """
        course_id = cfg['course_id']
        if not course_id:
            sys.stderr.write("ERROR: couldn't find the target course id\n")
            return

        out_dir = os.path.join(cfg['output_folder'], 'mp3')
        out_dir = os.path.expanduser(out_dir)
        if not os.path.isdir(out_dir):
            try:
                os.makedirs(out_dir)
            except OSError:
                sys.stderr.write("ERROR: couldn't create the output folder {}\n".format(out_dir))
                return

        url_only = cfg['url_only']

        try:
            dc = get_data_client(cfg)
        except Exception:
            # Not a bare ``except``: Ctrl-C / SystemExit must still propagate
            # instead of being reported as a login problem.
            sys.stderr.write("ERROR: invalid geektime account or password\n"
                             "Use '%s login --help' for  help.\n" % sys.argv[0].split(os.path.sep)[-1])
            return

        course_data = dc.get_course_intro(course_id)
        if int(course_data['column_type']) != 1:
            raise Exception('该课程不提供音频:%s' % course_data['column_title'])

        # One sub-folder per course, named after the course title.
        out_dir = os.path.join(out_dir, course_data['column_title'])
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

        sys.stdout.write('doing ......\n')
        data = dc.get_course_content(course_id)

        if url_only:
            title = EbookRender.format_file_name(course_data['column_title'])
            # Titles are usually non-ASCII; pin the encoding so the write does
            # not depend on (and possibly fail under) the platform default.
            with open(os.path.join(out_dir, '%s.mp3.txt' % title), 'w',
                      encoding='utf-8') as f:
                # TODO alignment
                f.write('\n'.join(["{}:\t\t{}".format(
                    EbookRender.format_file_name(post['article_title']),
                    post['audio_download_url']
                ) for post in data]))
                sys.stdout.write('download {} mp3 url done\n'.format(title))
            return

        dl = Downloader()
        for post in data:
            file_name = EbookRender.format_file_name(post['article_title']) + '.mp3'
            # Skip files that were already downloaded by a previous run.
            if os.path.isfile(os.path.join(out_dir, file_name)):
                sys.stdout.write(file_name + ' exists\n')
                continue
            # Articles without an audio URL are silently skipped.
            if post['audio_download_url']:
                dl.run(post['audio_download_url'], out_file=file_name, out_dir=out_dir)
                sys.stdout.write('download mp3 {} done\n'.format(file_name))
Ejemplo n.º 2
0
 def _parse_and_save_url(course_intro: dict,
                         course_data: list, out_dir: str):
     """Write the audio URL of every article to ``<title>.mp3.txt``.

     Args:
         course_intro: course metadata; ``column_title`` names the file.
         course_data: posts providing ``article_title`` and
             ``audio_download_url``.
         out_dir: existing directory in which the file is created.
     """
     title = Render.format_file_name(course_intro['column_title'])
     fn = os.path.join(out_dir, '{}.mp3.txt'.format(title))
     # Build the text first so a malformed post cannot leave a truncated file.
     lines = ["{}:\t\t{}".format(
         Render.format_file_name(post['article_title']),
         post['audio_download_url']
     ) for post in course_data]
     # Titles are usually non-ASCII; do not rely on the platform default
     # encoding, which may be unable to represent them.
     with open(fn, 'w', encoding='utf-8') as f:
         f.write('\n'.join(lines))
     sys.stdout.write('音频链接下载完成:{}\n\n'.format(fn))
Ejemplo n.º 3
0
    def run(self, cfg: dict):
        """Download the videos of every requested course, or only their URLs.

        Reads ``course_ids``, ``workers``, ``url_only`` and ``hd_only`` from
        *cfg*.  Each course gets its own folder (named after the sanitised
        course title) below the formatted output folder.  Courses that cannot
        be fetched or whose ``column_type`` is not 3 are reported on stderr
        and skipped.  The downloader is shut down after the last course.
        """
        dc = self.get_data_client(cfg)
        course_ids = self.parse_course_ids(cfg['course_ids'], dc)
        output_folder = self._format_output_folder(cfg)

        dl = Downloader(output_folder, workers=cfg['workers'])

        for course_id in course_ids:
            try:
                intro = dc.get_course_intro(course_id)
            except GkApiError as err:
                sys.stderr.write('{}\n\n'.format(err))
                continue

            if int(intro['column_type']) != 3:
                sys.stderr.write('该课程不是视频课程:{} {}\n\n'.format(
                    course_id, intro['column_title']))
                continue

            column_title = intro['column_title']
            course_dir = os.path.join(output_folder,
                                      Render.format_file_name(column_title))
            if not os.path.isdir(course_dir):
                os.makedirs(course_dir)

            # fetch raw data
            banner = '开始下载视频:{}-{}'.format(course_id, column_title)
            print(colored(banner, 'green'))
            posts = dc.get_course_content(
                course_id,
                pbar_desc='数据爬取中:{}'.format(column_title[:10]))

            # save url
            if cfg['url_only']:
                self._parse_and_save_url(intro, posts, course_dir)
                continue

            # download mp4
            for post in posts:
                base_name = Render.format_file_name(post['article_title'])
                stem = base_name + ('.hd' if cfg['hd_only'] else '.sd')
                target = os.path.join(course_dir, stem)
                # An existing "<stem>.ts" marks the post as already fetched.
                if os.path.isfile(target + '.ts'):
                    sys.stdout.write(stem + ' exists\n')
                    continue
                url = self._parse_url(post, cfg['hd_only'])
                if not url:
                    continue
                dl.run(url, target)
        dl.shutdown()
Ejemplo n.º 4
0
    def _parse_and_save_url(course_intro, course_data, out_dir):
        """Write the HD and SD video URL of every article to ``<title>.mp4.txt``.

        Args:
            course_intro: course metadata; ``column_title`` names the file.
            course_data: posts providing ``article_title`` and, optionally,
                a ``video_media_map`` with ``hd`` / ``sd`` entries.
            out_dir: existing directory in which the file is created.

        A missing or null map/entry yields ``None`` for that URL.
        """

        def _media_url(post, quality):
            # Tolerate both a missing key and an explicit null at each level.
            media_map = post.get('video_media_map') or {}
            return (media_map.get(quality) or {}).get('url')

        title = Render.format_file_name(course_intro['column_title'])
        fn = os.path.join(out_dir, '{}.mp4.txt'.format(title))
        # Build the text first so a malformed post cannot leave a truncated file.
        entries = [
            "{}:\n{}\n{}\n\n".format(
                Render.format_file_name(post['article_title']),
                _media_url(post, 'hd'),
                _media_url(post, 'sd')) for post in course_data
        ]
        # Titles are usually non-ASCII; do not rely on the platform default
        # encoding, which may be unable to represent them.
        with open(fn, 'w', encoding='utf-8') as f:
            f.write('\n'.join(entries))

        sys.stdout.write('视频链接下载完成:{}\n\n'.format(fn))
Ejemplo n.º 5
0
    def _render_source_files(self,
                             course_intro: dict,
                             course_content: list,
                             out_dir: str,
                             force: bool = False,
                             **kwargs) -> None:
        """
        Download the source files of a course.

        Writes, into ``<out_dir>/<column_title>``: the introduction page,
        the cover image, the table of contents and one HTML file per article.
        The introduction, cover and article files that already exist are kept
        unless *force* is true; the table of contents is always regenerated.
        Extra keyword arguments are passed through to
        ``render_article_html``.
        """
        column_title = course_intro['column_title']
        course_dir = os.path.join(out_dir, column_title)
        if not os.path.isdir(course_dir):
            os.makedirs(course_dir)

        render = Render(course_dir)

        def _must_render(file_name):
            # Regenerate when forced or when the file is not there yet.
            return force or not os.path.isfile(
                os.path.join(course_dir, file_name))

        # introduction
        if _must_render('简介.html'):
            render.render_article_html('简介', course_intro['column_intro'],
                                       **kwargs)
            sys.stdout.write('下载{}简介 done\n'.format(column_title))
        else:
            sys.stdout.write('{}简介 exists\n'.format(column_title))

        # cover
        if _must_render('cover.jpg'):
            render.generate_cover_img(course_intro['column_cover'])
            sys.stdout.write('下载{}封面 done\n'.format(column_title))
        else:
            sys.stdout.write('{}封面 exists\n'.format(column_title))

        # toc
        ebook_name = self._format_title(course_intro)
        toc_entries = ['简介']
        toc_entries.extend(
            render.format_file_name(item['article_title'])
            for item in course_content)
        render.render_toc_md(ebook_name, toc_entries)
        sys.stdout.write('下载{}目录 done\n'.format(column_title))

        # articles
        progress = tqdm(course_content)
        for article in progress:
            progress.set_description('HTML 文件下载中:{}'.format(
                article['article_title'][:10]))
            title = render.format_file_name(article['article_title'])
            if _must_render('{}.html'.format(title)):
                render.render_article_html(title, article['article_content'],
                                           **kwargs)
Ejemplo n.º 6
0
    def run(self, cfg: dict) -> None:
        """Build a mobi e-book for every requested course.

        Reads ``course_ids``, ``force``, ``comments_count`` and ``push`` from
        *cfg*.  For each course the source files are rendered, the e-book is
        (re)built when needed, and the result is sent to Kindle when
        ``push`` is set.  Courses that cannot be fetched or whose
        ``column_type`` is neither 1 nor 2 are reported on stderr and skipped.
        """

        course_ids = self.parse_course_ids(cfg['course_ids'])
        output_folder = self._format_output_folder(cfg)

        dc = self.get_data_client(cfg)

        for course_id in course_ids:
            try:
                course_intro = dc.get_course_intro(course_id, force=True)
            except GkApiError as e:
                sys.stderr.write('{}\n\n'.format(e))
                continue
            if int(course_intro['column_type']) not in (1, 2):
                # Terminate the message like the other per-course errors so
                # the next course's output does not run onto the same line.
                sys.stderr.write("ERROR: 该课程不提供文本:{}\n\n".format(
                    course_intro['column_title']))
                continue
            # From here on the title doubles as a folder name.
            course_intro['column_title'] = Render.format_file_name(
                course_intro['column_title'])

            # fetch raw data
            print(
                colored(
                    '开始制作电子书:{}-{}'.format(course_id,
                                           course_intro['column_title']),
                    'green'))
            pbar_desc = '数据爬取中:{}'.format(course_intro['column_title'][:10])
            data = dc.get_course_content(course_id,
                                         force=cfg['force'],
                                         pbar_desc=pbar_desc)
            if cfg['comments_count'] > 0:
                # Append the rendered comments to each article's HTML.
                for post in data:
                    post['article_content'] += self._render_comment_html(
                        post['comments'], cfg['comments_count'])

            # source file
            self._render_source_files(course_intro,
                                      data,
                                      output_folder,
                                      force=cfg['force'])

            # ebook: rebuilt whenever the course is unfinished or force is set
            ebook_name = self._format_title(course_intro)
            fn = os.path.join(output_folder, ebook_name) + '.mobi'
            if (not cfg['force'] and self.is_course_finished(course_intro)
                    and os.path.isfile(fn)):
                sys.stdout.write("{} exists\n".format(ebook_name))
            else:
                src_dir = os.path.join(output_folder,
                                       course_intro['column_title'])
                make_mobi(source_dir=src_dir, output_dir=output_folder)

            # push to kindle
            if cfg['push']:
                self._send_to_kindle(cfg, fn)
                sys.stdout.write("{} 已推送到 kindle\n\n".format(ebook_name))
Ejemplo n.º 7
0
    def run(self, cfg: dict) -> None:
        """Build a mobi e-book for one course and optionally push it to Kindle.

        Reads ``course_id``, ``output_folder``, ``force``,
        ``enable_comments``, ``comments_count``, ``source_only`` and ``push``
        from *cfg*.  Output goes to ``<output_folder>/ebook``.  A missing
        course id, an output folder that cannot be created, a failure to
        build the data client, or a course whose ``column_type`` is neither 1
        nor 2 is reported on stderr and the method returns without raising.
        """

        course_id = cfg['course_id']
        if not course_id:
            sys.stderr.write("ERROR: couldn't find the target course id\n")
            return
        out_dir = os.path.join(cfg['output_folder'], 'ebook')
        out_dir = os.path.expanduser(out_dir)
        if not os.path.isdir(out_dir):
            try:
                os.makedirs(out_dir)
            except OSError:
                sys.stderr.write(
                    "ERROR: couldn't create the output folder {}\n".format(
                        out_dir))
                return
        try:
            dc = get_data_client(cfg)
        except Exception:
            # Not a bare ``except``: Ctrl-C / SystemExit must still propagate
            # instead of being reported as a login problem.
            sys.stderr.write("ERROR: invalid geektime account or password\n"
                             "Use '%s login --help' for  help.\n" %
                             sys.argv[0].split(os.path.sep)[-1])
            return

        course_data = dc.get_course_intro(course_id, force=True)
        if int(course_data['column_type']) not in (1, 2):
            # Newline-terminated like every other error message here.
            sys.stderr.write("ERROR: 该课程不提供文本:%s\n" %
                             course_data['column_title'])
            return

        # data
        sys.stdout.write('doing ......\n')
        data = dc.get_course_content(course_id, force=cfg['force'])
        if cfg['enable_comments']:
            # Append the rendered comments to each article's HTML.
            for post in data:
                post['article_content'] += self._render_comment_html(
                    post['comments'], cfg['comments_count'])

        # source file
        course_data['column_title'] = Render.format_file_name(
            course_data['column_title'])
        self._render_source_files(course_data,
                                  data,
                                  out_dir,
                                  force=cfg['force'])

        # ebook: an existing file is reused only for a finished course
        ebook_name = self._title(course_data)
        if not cfg['source_only']:
            if course_data['is_finish'] and os.path.isfile(
                    os.path.join(out_dir, ebook_name) + '.mobi'):
                sys.stdout.write("{} exists\n".format(ebook_name))
            else:
                make_mobi(source_dir=os.path.join(out_dir,
                                                  course_data['column_title']),
                          output_dir=out_dir)

        # push to kindle
        if cfg['push'] and not cfg['source_only']:
            fn = os.path.join(out_dir, "{}.mobi".format(ebook_name))
            try:
                send_to_kindle(fn, cfg)
                sys.stdout.write("push to kindle done\n")
            except Exception as e:
                # Best effort: a failed push must not abort the command.
                sys.stderr.write(
                    "ERROR: push to kindle failed, e={}\n".format(e))
Ejemplo n.º 8
0
    def run(self, cfg: dict):
        """Download the videos of one course, or only list their URLs.

        Reads ``course_id``, ``output_folder``, ``url_only``, ``hd_only`` and
        ``workers`` from *cfg*.  Output goes to
        ``<output_folder>/mp4/<column_title>``.  When ``url_only`` is set, a
        single ``<title>.mp4.txt`` file with the HD and SD URL of every
        article is written and nothing is downloaded.  Otherwise downloads
        are dispatched to a pool of ``workers`` workers.

        A missing course id, an output folder that cannot be created, or a
        failure to build the data client is reported on stderr and the
        method returns without raising.

        Raises:
            Exception: if the course's ``column_type`` is not 3.
        """

        def _media_url(post, quality):
            # Tolerate both a missing key and an explicit null at each level.
            media_map = post.get('video_media_map') or {}
            return (media_map.get(quality) or {}).get('url')

        course_id = cfg['course_id']
        if not course_id:
            sys.stderr.write("ERROR: couldn't find the target course id\n")
            return

        out_dir = os.path.join(cfg['output_folder'], 'mp4')
        out_dir = os.path.expanduser(out_dir)
        if not os.path.isdir(out_dir):
            try:
                os.makedirs(out_dir)
            except OSError:
                sys.stderr.write(
                    "ERROR: couldn't create the output folder {}\n".format(
                        out_dir))
                return

        url_only = cfg['url_only']
        hd_only = cfg['hd_only']
        workers = cfg['workers']

        try:
            dc = get_data_client(cfg)
        except Exception:
            # Not a bare ``except``: Ctrl-C / SystemExit must still propagate
            # instead of being reported as a login problem.
            sys.stderr.write("ERROR: invalid geektime account or password\n"
                             "Use '%s login --help' for  help.\n" %
                             sys.argv[0].split(os.path.sep)[-1])
            return

        course_data = dc.get_course_intro(course_id)

        if int(course_data['column_type']) != 3:
            raise Exception('该课程不是视频课程:%s' % course_data['column_title'])

        # One sub-folder per course, named after the course title.
        out_dir = os.path.join(out_dir, course_data['column_title'])
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

        sys.stdout.write('doing ......\n')
        data = dc.get_course_content(course_id)
        if url_only:
            title = EbookRender.format_file_name(course_data['column_title'])
            # Build the text first so a malformed post cannot leave a
            # truncated file behind.
            entries = [
                "{}:\n{}\n{}\n\n".format(
                    EbookRender.format_file_name(post['article_title']),
                    _media_url(post, 'hd'),
                    _media_url(post, 'sd'))
                for post in data
            ]
            # Titles are usually non-ASCII; do not rely on the platform
            # default encoding, which may be unable to represent them.
            with open(os.path.join(out_dir, '%s.mp4.txt' % title), 'w',
                      encoding='utf-8') as f:
                f.write('\n'.join(entries))
            sys.stdout.write('download {} mp4 url done\n'.format(title))
            return

        dl = Downloader()
        p = Pool(workers)
        start = time.time()
        for post in data:
            file_name = EbookRender.format_file_name(
                post['article_title']) + ('.hd' if hd_only else '.sd')
            # An existing "<file_name>.ts" marks the post as already fetched.
            if os.path.isfile(os.path.join(out_dir, file_name) + '.ts'):
                sys.stdout.write(file_name + ' exists\n')
                continue
            if hd_only:  # some post has sd mp4 only
                # Fall back to the SD stream of the same media map (the
                # previous code looked it up in ``video_media`` by mistake).
                url = _media_url(post, 'hd') or _media_url(post, 'sd')
            else:
                url = _media_url(post, 'sd')

            if not url:
                # Nothing to download for this post; do not queue a job
                # that could only fail inside the worker.
                continue

            p.apply_async(dl.run, (url, out_dir, file_name))

        p.close()
        p.join()
        sys.stdout.write('download {} done, cost {}s\n'.format(
            course_data['column_title'], int(time.time() - start)))
Ejemplo n.º 9
0
def test_format_path(render: Render):
    """A trailing backslash is removed when a file name is formatted."""
    raw_name = 'hell\\'
    expected = 'hell'
    assert render.format_file_name(raw_name) == expected