def setup_database(wipe=False):
    logger.info("Setting up the database")

    for model in (License, Format, Author, Book, BookFormat):
        if wipe:
            model.drop_table(fail_silently=True)
        if not model.table_exists():
            model.create_table()
            logger.debug("Created table for {}".format(model._meta.name))
            load_fixtures(model)
        else:
            logger.debug("{} table already exists.".format(model._meta.name))
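# A minimal sketch (not part of the original module) of driving the schema
# setup above: with wipe=True every table is dropped and recreated, and
# load_fixtures() repopulates each freshly created model.
def _example_reset_database():
    setup_database(wipe=True)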
def handle_companion_file(fname, dstfname=None, book=None):
    src = os.path.join(path(download_cache).abspath(), fname)
    if dstfname is None:
        dstfname = fname
    dst = os.path.join(path(static_folder).abspath(), dstfname)

    # optimization based on mime/extension
    if path(fname).ext in ('.png', '.jpg', '.jpeg', '.gif'):
        copy_from_cache(src, dst)
        optimize_image(path_for_cmd(dst))
    elif path(fname).ext == '.epub':
        tmp_epub = tempfile.NamedTemporaryFile(suffix='.epub', dir=TMP_FOLDER)
        tmp_epub.close()
        optimize_epub(src, tmp_epub.name)
        path(tmp_epub.name).move(dst)
    else:
        # exclude files created by Windows Explorer
        if src.endswith('_Thumbs.db'):
            return
        # copy otherwise (PDF mostly)
        logger.debug("\t\tunhandled extension: {}".format(dst))
        copy_from_cache(src, dst)
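# Illustrative call sites (hypothetical file names) covering the three branches
# above: raster images are copied then optimized in place, EPUBs are rewritten
# through a temporary file before being moved into place, and everything else
# (PDF mostly) is copied verbatim. Note that handle_companion_file() relies on
# the module-level download_cache and static_folder rather than taking them as
# arguments.
#
#   handle_companion_file('1342-cover.jpg')                 # image branch
#   handle_companion_file('pg1342.epub', 'Book.1342.epub')  # epub branch
#   handle_companion_file('pg1342.pdf')                     # fallback copy branch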
def download_all_books(url_mirror, download_cache, languages=[],
                       formats=[], only_books=[], force=False):

    available_books = get_list_of_filtered_books(
        languages=languages, formats=formats, only_books=only_books)

    # ensure dir exists
    path(download_cache).mkdir_p()

    for book in available_books:

        logger.info("\tDownloading content files for Book #{id}"
                    .format(id=book.id))

        # apply filters
        if not formats:
            formats = FORMAT_MATRIX.keys()

        # HTML is our base for the ZIM, so add it if not present
        if 'html' not in formats:
            formats.append('html')

        for format in formats:

            fpath = os.path.join(download_cache, fname_for(book, format))

            # check if already downloaded
            if path(fpath).exists() and not force:
                logger.debug("\t\t{fmt} already exists at {path}"
                             .format(fmt=format, path=fpath))
                continue

            # retrieve corresponding BookFormat
            bfs = BookFormat.filter(book=book)

            if format == 'html':
                patterns = ['mnsrb10h.htm', '8ledo10h.htm', 'tycho10f.htm',
                            '8ledo10h.zip', 'salme10h.htm', '8nszr10h.htm',
                            '{id}-h.html', '{id}.html.gen', '{id}-h.htm',
                            '8regr10h.zip', '{id}.html.noimages',
                            '8lgme10h.htm', 'tycho10h.htm', 'tycho10h.zip',
                            '8lgme10h.zip', '8indn10h.zip', '8resp10h.zip',
                            '20004-h.htm', '8indn10h.htm', '8memo10h.zip',
                            'fondu10h.zip', '{id}-h.zip', '8mort10h.zip']
                bfso = bfs
                bfs = bfs.join(Format).filter(Format.pattern << patterns)
                if not bfs.count():
                    from pprint import pprint as pp
                    pp(list([(b.format.mime, b.format.images, b.format.pattern)
                             for b in bfs]))
                    pp(list([(b.format.mime, b.format.images, b.format.pattern)
                             for b in bfso]))
                    logger.error("html not found")
                    continue
            else:
                bfs = bfs.filter(BookFormat.format << Format.filter(
                    mime=FORMAT_MATRIX.get(format)))

            if not bfs.count():
                logger.debug("[{}] not avail. for #{}# {}"
                             .format(format, book.id, book.title))
                continue

            if bfs.count() > 1:
                try:
                    bf = bfs.join(Format).filter(Format.images == True).get()
                except Exception:
                    bf = bfs.get()
            else:
                bf = bfs.get()

            logger.debug("[{}] Requesting URLs for #{}# {}"
                         .format(format, book.id, book.title))

            # retrieve list of URLs for format unless we have it in DB
            if bf.downloaded_from and not force:
                urls = [bf.downloaded_from]
            else:
                urld = get_urls(book)
                urls = list(reversed(urld.get(FORMAT_MATRIX.get(format))))

            import copy
            allurls = copy.copy(urls)

            while urls:
                url = urls.pop()

                if not resource_exists(url):
                    continue

                # HTML files are *sometimes* available as ZIP files
                if url.endswith('.zip'):
                    zpath = "{}.zip".format(fpath)

                    if not download_file(url, zpath):
                        logger.error("ZIP file download failed: {}".format(zpath))
                        continue

                    # extract zipfile
                    handle_zipped_epub(zippath=zpath, book=book,
                                       download_cache=download_cache)
                else:
                    if not download_file(url, fpath):
                        logger.error("file download failed: {}".format(fpath))
                        continue

                # store working URL in DB
                bf.downloaded_from = url
                bf.save()

            if not bf.downloaded_from:
                logger.error("NO FILE FOR #{}/{}".format(book.id, format))
                from pprint import pprint as pp
                pp(allurls)
                continue
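# For reference, download_all_books() assumes FORMAT_MATRIX maps a short format
# name to the MIME type recorded on Format rows. A sketch of the expected shape
# (the values here are an assumption, not copied from the project's settings):
#
#   FORMAT_MATRIX = {
#       'html': 'text/html',
#       'epub': 'application/epub+zip',
#       'pdf': 'application/pdf',
#   }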
def export_all_books(static_folder, download_cache,
                     languages=[], formats=[], only_books=[]):

    # ensure dir exists
    path(static_folder).mkdir_p()

    books = get_list_of_filtered_books(languages=languages,
                                       formats=formats,
                                       only_books=only_books)

    sz = len(list(books))
    logger.debug("\tFiltered book collection size: {}".format(sz))

    def nb_by_fmt(fmt):
        return sum([1 for book in books
                    if BookFormat.select(BookFormat, Book, Format)
                                 .join(Book).switch(BookFormat)
                                 .join(Format)
                                 .where(Book.id == book.id)
                                 .where(Format.mime == FORMAT_MATRIX.get(fmt))
                                 .count()])

    logger.debug("\tFiltered book collection, PDF: {}".format(nb_by_fmt('pdf')))
    logger.debug("\tFiltered book collection, ePUB: {}".format(nb_by_fmt('epub')))
    logger.debug("\tFiltered book collection, HTML: {}".format(nb_by_fmt('html')))

    # export to JSON helpers
    export_to_json_helpers(books=books,
                           static_folder=static_folder,
                           languages=languages,
                           formats=formats)

    # copy CSS/JS/* to static_folder
    src_folder = tmpl_path()
    for fname in ('css', 'js', 'jquery', 'favicon.ico', 'favicon.png',
                  'jquery-ui', 'datatables', 'fonts', 'l10n'):
        src = os.path.join(src_folder, fname)
        dst = os.path.join(static_folder, fname)
        if not path(fname).ext:
            path(dst).rmtree_p()
            path(src).copytree(dst)
        else:
            path(src).copyfile(dst)

    # export homepage
    template = jinja_env.get_template('index.html')
    context = get_default_context(books=books)
    context.update({'show_books': True})
    with open(os.path.join(static_folder, 'Home.html'), 'w') as f:
        f.write(template.render(**context).encode('utf-8'))

    # Compute popularity
    popbooks = books.order_by(Book.downloads.desc())
    stars_limits = [0] * NB_POPULARITY_STARS
    stars = NB_POPULARITY_STARS
    nb_downloads = popbooks[0].downloads
    for ibook in range(0, popbooks.count(), 1):
        if ibook > float(NB_POPULARITY_STARS - stars + 1) \
                / NB_POPULARITY_STARS * popbooks.count() \
                and popbooks[ibook].downloads < nb_downloads:
            stars_limits[stars - 1] = nb_downloads
            stars = stars - 1
            nb_downloads = popbooks[ibook].downloads

    # export to HTML
    cached_files = os.listdir(download_cache)
    for book in books:
        book.popularity = sum([int(book.downloads >= stars_limits[i])
                               for i in range(NB_POPULARITY_STARS)])
        export_book_to(book=book,
                       static_folder=static_folder,
                       download_cache=download_cache,
                       cached_files=cached_files,
                       languages=languages,
                       formats=formats,
                       books=books)
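# Worked example (made-up numbers) of the popularity computation above, with
# NB_POPULARITY_STARS = 5. Suppose the threshold loop ends with
#
#   stars_limits = [0, 10, 50, 200, 1000]
#
# i.e. ascending download-count thresholds, one per star. A book's popularity is
# the number of thresholds its download count reaches, so a book with 60
# downloads gets:
#
#   int(60 >= 0) + int(60 >= 10) + int(60 >= 50) + int(60 >= 200) + int(60 >= 1000)
#   = 1 + 1 + 1 + 0 + 0 = 3 stars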
def build_zimfile(static_folder, zim_path=None,
                  languages=[], formats=[],
                  title=None, description=None,
                  only_books=[]):

    if not languages:
        languages = ['mul']

    languages.sort()
    formats.sort()

    if title is None:
        if len(languages) > 5:
            title = ("Project Gutenberg Library with {formats}"
                     .format(formats=",".join(formats)))
        else:
            title = ("Project Gutenberg Library ({langs}) with {formats}"
                     .format(langs=",".join(languages),
                             formats=",".join(formats)))

    logger.info("\tWriting ZIM for {}".format(title))

    if description is None:
        description = "The first producer of free ebooks"

    if zim_path is None:
        if len(languages) > 1:
            zim_path = "gutenberg_all_{date}.zim".format(
                date=datetime.datetime.now().strftime('%m_%Y'))
        else:
            zim_path = "gutenberg_{lang}_all_{date}.zim".format(
                lang=languages[0],
                date=datetime.datetime.now().strftime('%Y-%m'))

    languages = [ISO_MATRIX.get(lang, lang) for lang in languages]
    languages.sort()

    context = {
        'languages': ','.join(languages),
        'title': title,
        'description': description,
        'creator': 'gutenberg.org',
        'publisher': 'Kiwix',

        'home': 'Home.html',
        'favicon': 'favicon.png',

        'static': static_folder,
        'zim': zim_path
    }

    cmd = ('zimwriterfs --welcome=\\"{home}\\" --favicon=\\"{favicon}\\" '
           '--language=\\"{languages}\\" --title=\\"{title}\\" '
           '--description=\\"{description}\\" '
           '--creator=\\"{creator}\\" --publisher=\\"{publisher}\\" '
           '\\"{static}\\" \\"{zim}\\"'.format(**context))

    logger.debug("\t\t{}".format(re.sub('\\\\"', '"', cmd)))

    if exec_cmd(cmd):
        logger.info("Successfully created ZIM file at {}".format(zim_path))
    else:
        logger.error("Unable to create ZIM file :(")
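# A minimal end-to-end sketch (not part of the original module) chaining the
# steps above for a small selection. The mirror URL, paths and book IDs are
# hypothetical placeholders; build_zimfile() additionally requires the
# zimwriterfs binary on PATH, since it shells out to it via exec_cmd().
def _example_build_small_zim():
    setup_database(wipe=False)
    download_all_books(url_mirror='http://mirror.example.org/gutenberg',
                       download_cache='dl-cache',
                       languages=['en'], formats=['epub', 'pdf'],
                       only_books=[1342, 2701])
    export_all_books(static_folder='static',
                     download_cache='dl-cache',
                     languages=['en'], formats=['epub', 'pdf'],
                     only_books=[1342, 2701])
    build_zimfile(static_folder='static',
                  languages=['en'], formats=['epub', 'pdf'],
                  only_books=[1342, 2701])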
def load_fixtures(model):
    logger.info("Loading fixtures for {}".format(model._meta.name))

    for fixture in getattr(model._meta, 'fixtures', []):
        f = model.create(**fixture)
        logger.debug("[fixtures] Created {}".format(f))
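# Sketch (field names and values are an assumption, not taken from the project)
# of how a model can expose fixtures for load_fixtures() to pick up: a
# `fixtures` list of keyword dicts on its Meta, each dict becoming one
# model.create(**fixture) call.
#
#   class License(BaseModel):
#       class Meta:
#           fixtures = [
#               {'slug': 'PD', 'name': 'Public domain in the USA.'},
#               {'slug': 'Copyright', 'name': 'Copyrighted.'},
#           ]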