def make_column_ebook(column_id, column_title, output_dir):
    """Crawl a geektime column, render its source files and build a mobi.

    The path ``<output_dir>/sqlite3.db`` is handed to the spider as its
    backend database and later to the renderer as the source database.
    Source files are rendered into ``<output_dir>/<column_id>`` and that
    directory is then passed to ``make_mobi``.
    """
    sqlite_path = os.path.join(output_dir, 'sqlite3.db')

    # start spider
    articles_payload = {
        "cid": str(column_id),
        "size": 1000,
        "prev": 0,
        "order": "newest",
    }
    request_headers = {
        'Content-Type': 'application/json',
        'Referer': 'https://time.geekbang.org/column/{}'.format(column_id),
    }
    crawler = spider.get_spider(
        backend_db_url=sqlite_path,
        start_url='https://time.geekbang.org/serv/v1/column/articles',
        headers=request_headers,
        json=articles_payload,
    )
    # The intro endpoint is registered as an additional URL with the same
    # headers before crawling starts.
    crawler.add_url(
        url='https://time.geekbang.org/serv/v1/column/intro',
        headers=request_headers,
        json={'cid': str(column_id)},
    )
    crawler.start_crawl()

    # generate source files
    rendered_dir = os.path.join(output_dir, str(column_id))
    render_column_source_files(
        column_id, column_title, rendered_dir, source_db_path=sqlite_path)

    # generate ebook
    make_mobi(source_dir=rendered_dir, output_dir=output_dir)
def run(self, cfg: dict) -> None:
    """Build a mobi ebook for every requested course, optionally pushing it.

    Keys read directly from *cfg*: ``course_ids``, ``force``,
    ``comments_count`` and ``push``. *cfg* is also handed to
    ``_format_output_folder``, ``get_data_client`` and ``_send_to_kindle``;
    what those read is not visible here.

    A course is skipped (with a message on stderr) when its intro cannot
    be fetched (``GkApiError``) or when its ``column_type`` is not 1 or 2.
    """
    course_ids = self.parse_course_ids(cfg['course_ids'])
    output_folder = self._format_output_folder(cfg)
    dc = self.get_data_client(cfg)

    for course_id in course_ids:
        try:
            course_intro = dc.get_course_intro(course_id, force=True)
        except GkApiError as e:
            sys.stderr.write('{}\n\n'.format(e))
            continue

        if int(course_intro['column_type']) not in (1, 2):
            # The loop keeps going after this message, so terminate the
            # line; otherwise the next course's output is glued onto it.
            sys.stderr.write("ERROR: 该课程不提供文本:{}\n".format(
                course_intro['column_title']))
            continue

        course_intro['column_title'] = Render.format_file_name(
            course_intro['column_title'])

        # fetch raw data
        print(
            colored(
                '开始制作电子书:{}-{}'.format(
                    course_id, course_intro['column_title']),
                'green'))
        pbar_desc = '数据爬取中:{}'.format(course_intro['column_title'][:10])
        data = dc.get_course_content(
            course_id, force=cfg['force'], pbar_desc=pbar_desc)
        if cfg['comments_count'] > 0:
            # Append the rendered comments to each article body.
            for post in data:
                post['article_content'] += self._render_comment_html(
                    post['comments'], cfg['comments_count'])

        # source file
        self._render_source_files(
            course_intro, data, output_folder, force=cfg['force'])

        # ebook: rebuilt when the course is unfinished or force is set;
        # otherwise an existing .mobi file is reused.
        ebook_name = self._format_title(course_intro)
        fn = os.path.join(output_folder, ebook_name) + '.mobi'
        if (not cfg['force']
                and self.is_course_finished(course_intro)
                and os.path.isfile(fn)):
            sys.stdout.write("{} exists\n".format(ebook_name))
        else:
            src_dir = os.path.join(
                output_folder, course_intro['column_title'])
            make_mobi(source_dir=src_dir, output_dir=output_folder)

        # push to kindle
        if cfg['push']:
            self._send_to_kindle(cfg, fn)
            sys.stdout.write("{} 已推送到 kindle\n\n".format(ebook_name))
def run(self, args):
    """Download one course and build its mobi ebook.

    Args:
        args: ``args[0]`` is the course id. The remaining items are
            scanned for ``--out-dir=PATH`` (default ``./ebook``),
            ``--force``, ``--enable-comments``, ``--source-only`` and
            ``--comment-count=N`` (default 10).

    Raises:
        ValueError: if the ``--comment-count=`` value is not an integer.
        Exception: if the course's ``column_type`` is not 1 or 2.
    """
    course_id = args[0]
    options = args[1:]

    out_dir = './ebook'
    for arg in options:
        if '--out-dir=' in arg:
            out_dir = arg.split('--out-dir=')[1] or './ebook'
            break

    force = '--force' in options
    enable_comments = '--enable-comments' in options
    source_only = '--source-only' in options

    comment_count = 10
    for arg in options:
        if '--comment-count=' in arg:
            # The option value is text; convert it so the count has the
            # same type (int) whether it was given or defaulted.
            comment_count = int(arg.split('--comment-count=')[1] or 10)
            break

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    dc = DataClient()
    course_data = dc.get_course_intro(course_id)
    if int(course_data['column_type']) not in (1, 2):
        raise Exception('该课程不提供文本:%s' % course_data['column_title'])

    # data
    data = dc.get_course_content(course_id, force=force)
    if enable_comments:
        # Append the rendered comments to each article body.
        for post in data:
            post['article_content'] += self._render_comment_html(
                post['comments'], comment_count)

    # source file
    course_data['column_title'] = maker.format_file_name(
        course_data['column_title'])
    self.render_column_source_files(course_data, data, out_dir, force=force)

    # ebook
    if source_only:
        return
    title = self._title(course_data)
    mobi_path = os.path.join(out_dir, title) + '.mobi'
    # An existing .mobi is reused only when update_frequency is '全集'.
    if course_data['update_frequency'] == '全集' and os.path.isfile(mobi_path):
        print("{} exists ".format(title))
    else:
        make_mobi(
            source_dir=os.path.join(out_dir, course_data['column_title']),
            output_dir=out_dir)
def run(self, cfg: dict) -> None:
    """Download one course, build its mobi ebook and optionally push it.

    Keys read from *cfg*: ``course_id``, ``output_folder``, ``force``,
    ``enable_comments``, ``comments_count``, ``source_only`` and ``push``
    (``push`` is only consulted when ``source_only`` is falsy). *cfg* is
    also passed to ``get_data_client`` and ``send_to_kindle``.

    The failures handled here (missing course id, output folder that
    cannot be created, data client that cannot be built, course without
    text, failed push) are reported on stderr; all but the failed push
    end the method early.
    """
    course_id = cfg['course_id']
    if not course_id:
        sys.stderr.write("ERROR: couldn't find the target course id\n")
        return

    out_dir = os.path.join(cfg['output_folder'], 'ebook')
    if not os.path.isdir(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            sys.stderr.write(
                "ERROR: couldn't create the output folder {}\n".format(
                    out_dir))
            return

    try:
        dc = get_data_client(cfg)
    except Exception:
        # Not a bare ``except``: Ctrl-C and SystemExit must still propagate
        # instead of being reported as a login problem.
        sys.stderr.write("ERROR: invalid geektime account or password\n"
                         "Use '%s <command> login --help' for help.\n"
                         % sys.argv[0].split(os.path.sep)[-1])
        return

    course_data = dc.get_course_intro(course_id, force=True)
    if int(course_data['column_type']) not in (1, 2):
        # Newline-terminated like the other error messages in this method.
        sys.stderr.write(
            "ERROR: 该课程不提供文本:%s\n" % course_data['column_title'])
        return

    # data
    data = dc.get_course_content(course_id, force=cfg['force'])
    if cfg['enable_comments']:
        # Append the rendered comments to each article body.
        for post in data:
            post['article_content'] += self._render_comment_html(
                post['comments'], cfg['comments_count'])

    # source file
    course_data['column_title'] = maker.format_file_name(
        course_data['column_title'])
    self._render_column_source_files(
        course_data, data, out_dir, force=cfg['force'])

    if cfg['source_only']:
        return

    # ebook
    title = self._title(course_data)
    fn = os.path.join(out_dir, "{}.mobi".format(title))
    # An existing .mobi is reused only when update_frequency is '全集'.
    if course_data['update_frequency'] == '全集' and os.path.isfile(fn):
        sys.stdout.write("{} exists\n".format(title))
    else:
        make_mobi(
            source_dir=os.path.join(out_dir, course_data['column_title']),
            output_dir=out_dir)

    # push to kindle
    if cfg['push']:
        try:
            send_to_kindle(fn, cfg)
            sys.stdout.write("push to kindle done\n")
        except Exception as e:
            sys.stderr.write(
                "ERROR: push to kindle failed, e={}\n".format(e))
def test_make_ebook():
    """make_mobi returns the path of an existing file; remove it afterwards."""
    source_dir, output_dir = './examples/source', './examples/'
    ebook_path = make_mobi(source_dir, output_dir)
    assert os.path.isfile(ebook_path)
    os.remove(ebook_path)
def test_make_ebook():
    """make_mobi returns the path of an existing file; remove it afterwards.

    The examples directory is resolved relative to this file (two levels
    up, then ``examples``) rather than the current working directory.
    """
    examples_dir = pathlib.Path(__file__).parent.parent / 'examples'
    ebook_path = make_mobi(str(examples_dir / 'source'), str(examples_dir))
    assert os.path.isfile(ebook_path)
    os.remove(ebook_path)
def run(self, args):
    """Download one course, build its mobi ebook and optionally mail it.

    Args:
        args: ``args[0]`` is the course id. The remaining items are
            scanned for ``--out-dir=PATH`` (default ``./ebook``),
            ``--force``, ``--enable-comments``, ``--source-only``,
            ``--push`` and ``--comment-count=N`` (default 10).

    With ``--push`` the ``.mobi`` file is sent as an e-mail attachment
    using the JSON settings in ``smtp.conf`` (opened relative to the
    current working directory); files larger than 50 MiB are not sent.

    Raises:
        ValueError: if the ``--comment-count=`` value is not an integer.
        Exception: if the course's ``column_type`` is not 1 or 2.
    """
    course_id = args[0]
    options = args[1:]

    out_dir = './ebook'
    for arg in options:
        if '--out-dir=' in arg:
            out_dir = arg.split('--out-dir=')[1] or './ebook'
            break

    force = '--force' in options
    enable_comments = '--enable-comments' in options
    source_only = '--source-only' in options
    push = '--push' in options

    comment_count = 10
    for arg in options:
        if '--comment-count=' in arg:
            # The option value is text; convert it so the count has the
            # same type (int) whether it was given or defaulted.
            comment_count = int(arg.split('--comment-count=')[1] or 10)
            break

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    dc = DataClient()
    course_data = dc.get_course_intro(course_id)
    if int(course_data['column_type']) not in (1, 2):
        raise Exception('该课程不提供文本:%s' % course_data['column_title'])

    # data
    data = dc.get_course_content(course_id, force=force)
    if enable_comments:
        # Append the rendered comments to each article body.
        for post in data:
            post['article_content'] += self._render_comment_html(
                post['comments'], comment_count)

    # source file
    course_data['column_title'] = maker.format_file_name(
        course_data['column_title'])
    self.render_column_source_files(course_data, data, out_dir, force=force)

    # ebook
    if not source_only:
        title = self._title(course_data)
        mobi_path = os.path.join(out_dir, title) + '.mobi'
        # An existing .mobi is reused only when update_frequency is '全集'.
        if (course_data['update_frequency'] == '全集'
                and os.path.isfile(mobi_path)):
            print("{} exists ".format(title))
        else:
            make_mobi(
                source_dir=os.path.join(
                    out_dir, course_data['column_title']),
                output_dir=out_dir)

    if push:
        attachment_name = "{}.mobi".format(self._title(course_data))
        fn = os.path.join(out_dir, attachment_name)
        if os.path.getsize(fn) / 1024.0 / 1024 > 50:
            print("电子书大小超过50M")
            return

        # Context manager so the handle is closed even if read() fails.
        with open(fn, 'rb') as f:
            d = f.read()

        with open('smtp.conf') as f:
            smtp_conf = json.loads(f.read())

        m = MailServer(host=smtp_conf['host'],
                       port=smtp_conf['port'],
                       user=smtp_conf['user'],
                       password=smtp_conf['password'],
                       encryption=smtp_conf['encryption'])
        message = m.build_email(email_to=smtp_conf['email_to'],
                                subject='convert',
                                body='',
                                attachments=[(attachment_name, d)])
        m.send_email(message)
        print("push to kindle done")
def run(self, cfg: dict) -> None:
    """Download one course, build its mobi ebook and optionally push it.

    Keys read from *cfg*: ``course_id``, ``output_folder`` (``~`` is
    expanded), ``force``, ``enable_comments``, ``comments_count``,
    ``source_only`` and ``push`` (``push`` is only consulted when
    ``source_only`` is falsy). *cfg* is also passed to ``get_data_client``
    and ``send_to_kindle``.

    The failures handled here (missing course id, output folder that
    cannot be created, data client that cannot be built, course without
    text, failed push) are reported on stderr; all but the failed push
    end the method early.
    """
    course_id = cfg['course_id']
    if not course_id:
        sys.stderr.write("ERROR: couldn't find the target course id\n")
        return

    out_dir = os.path.expanduser(
        os.path.join(cfg['output_folder'], 'ebook'))
    if not os.path.isdir(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            sys.stderr.write(
                "ERROR: couldn't create the output folder {}\n".format(
                    out_dir))
            return

    try:
        dc = get_data_client(cfg)
    except Exception:
        sys.stderr.write("ERROR: invalid geektime account or password\n"
                         "Use '{} login --help' for help.\n".format(
                             sys.argv[0].split(os.path.sep)[-1]))
        return

    course_data = dc.get_course_intro(course_id, force=True)
    if int(course_data['column_type']) not in (1, 2):
        # Newline-terminated like the other error messages in this method.
        sys.stderr.write("ERROR: 该课程不提供文本:{}\n".format(
            course_data['column_title']))
        return

    # data
    sys.stdout.write('doing ......\n')
    data = dc.get_course_content(course_id, force=cfg['force'])
    if cfg['enable_comments']:
        # Append the rendered comments to each article body.
        for post in data:
            post['article_content'] += self._render_comment_html(
                post['comments'], cfg['comments_count'])

    # source file
    course_data['column_title'] = Render.format_file_name(
        course_data['column_title'])
    self._render_source_files(
        course_data, data, out_dir, force=cfg['force'])

    # ebook
    ebook_name = self._title(course_data)
    if cfg['source_only']:
        return

    fn = os.path.join(out_dir, ebook_name) + '.mobi'
    # An existing .mobi is reused only when the course is marked finished.
    if course_data['is_finish'] and os.path.isfile(fn):
        sys.stdout.write("{} exists\n".format(ebook_name))
    else:
        src_dir = os.path.join(out_dir, course_data['column_title'])
        make_mobi(source_dir=src_dir, output_dir=out_dir)

    # push to kindle
    if cfg['push']:
        try:
            send_to_kindle(fn, cfg)
            sys.stdout.write("push to kindle done\n")
        except Exception as e:
            sys.stderr.write(
                "ERROR: push to kindle failed, e={}\n".format(e))
import os

from kindle_maker import make_mobi


def main():
    """List the entries under the root directory and under each of them.

    Scaffold only: the root and destination paths are empty strings and
    nothing collected here is used or returned.
    """
    root_dir = r""
    dest_dir = r""
    chapter_names = os.listdir(root_dir)
    chapter_to_lectures = {}
    sources = []
    for chapter_name in chapter_names:
        chapter_dir = os.path.join(root_dir, chapter_name)
        lecture_names = os.listdir(chapter_dir)


if __name__ == "__main__":
    # main() is not invoked; a single fixed source folder is converted.
    make_mobi(r"E:\24-Java并发编程实战(完结)\03-第一部分:并发理论基础 (13讲)",
              r"E:\24-Java并发编程实战(完结)\Test")