def extract_rdf_files(rdf_tarball, rdf_path):
    """Unpack the RDF tarball into *rdf_path*, skipping if already present."""
    destination = path(rdf_path)
    if destination.exists():
        logger.info("\tRDF-files folder already exists in {}".format(rdf_path))
        return
    logger.info("\tExtracting {} into {}".format(rdf_tarball, rdf_path))
    # make sure the destination folder exists before untarring into it
    destination.mkdir_p()
    command = "tar -C {dest} --strip-components 2 -x -f {tarb}".format(
        dest=rdf_path, tarb=rdf_tarball)
    exec_cmd(command)
    return
def optimize_epub(src, dst):
    """Rebuild the ePUB archive at *src* into an optimized one at *dst*.

    Extracts the archive into a temp dir, optimizes images (removing an
    auto-generated cover when detected), rewrites HTML for static hosting,
    strips the license section from the NCX, patches the OPF manifest if
    the cover was dropped, then re-zips everything and removes the temp dir.

    NOTE(review): relies on a free variable ``book`` (plus helpers such as
    update_html_for_static / is_bad_cover defined elsewhere in this module)
    — confirm ``book`` is in scope at the call site.
    """
    logger.info("\t\tCreating ePUB at {}".format(dst))
    zipped_files = []
    # create temp directory to extract to
    tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)
    with zipfile.ZipFile(src, 'r') as zf:
        zipped_files = zf.namelist()
        zf.extractall(tmpd)
    remove_cover = False
    # BUGFIX: iterate over a copy — the loop body removes entries from
    # zipped_files, and mutating a list while iterating it skips elements.
    for fname in list(zipped_files):
        fnp = os.path.join(tmpd, fname)
        if path(fname).ext in ('.png', '.jpeg', '.jpg', '.gif'):
            # special case to remove ugly cover
            if fname.endswith('cover.jpg') and is_bad_cover(fnp):
                zipped_files.remove(fname)
                remove_cover = True
            else:
                optimize_image(path_for_cmd(fnp))
        if path(fname).ext in ('.htm', '.html'):
            # context managers instead of bare open()/close() pairs
            with open(fnp, 'r') as f:
                html = update_html_for_static(book=book,
                                              html_content=f.read(),
                                              epub=True)
            with open(fnp, 'w') as f:
                f.write(html)
        if path(fname).ext == '.ncx':
            pattern = "*** START: FULL LICENSE ***"
            with open(fnp, 'r') as f:
                ncx = f.read()
            soup = BeautifulSoup(ncx, ["lxml", "xml"])
            for tag in soup.findAll('text'):
                if pattern in tag.text:
                    container = tag.parent.parent
                    # BUGFIX: snapshot the trailing siblings BEFORE
                    # decomposing — a decomposed tag loses its sibling
                    # links, so the original post-decompose loop was broken
                    # (it also had a stray no-op `s.next_sibling` statement).
                    trailing = list(container.next_siblings)
                    container.decompose()
                    for sibling in trailing:
                        # extract() works for both Tags and NavigableStrings
                        # (NavigableString has no decompose()).
                        sibling.extract()
            with open(fnp, 'w') as f:
                # NOTE(review): soup.encode() returns bytes — fine on
                # Python 2, fails on Python 3 text-mode files; confirm
                # target interpreter before changing.
                f.write(soup.encode())
    # delete {id}/cover.jpg if exist and update {id}/content.opf
    if remove_cover:
        # remove cover
        path(os.path.join(tmpd, str(book.id), 'cover.jpg')).unlink_p()
        soup = None
        opff = os.path.join(tmpd, str(book.id), 'content.opf')
        if os.path.exists(opff):
            with open(opff, 'r') as fd:
                soup = BeautifulSoup(fd.read(), ["lxml", "xml"])
            # drop the cover entry from the OPF manifest
            for elem in soup.findAll():
                if getattr(elem, 'attrs', {}).get('href') == 'cover.jpg':
                    elem.decompose()
            with open(opff, 'w') as fd:
                fd.write(soup.encode())
    # a valid ePUB requires `mimetype` first and stored uncompressed (-0)
    with cd(tmpd):
        exec_cmd('zip -q0X "{dst}" mimetype'.format(dst=path_for_cmd(dst)))
        exec_cmd('zip -qXr9D "{dst}" {files}'.format(
            dst=path_for_cmd(dst),
            files=" ".join([f for f in zipped_files if not f == 'mimetype'])))
    path(tmpd).rmtree_p()
def optimize_jpeg(fpath):
    """Shrink the JPEG at *fpath* in place via jpegoptim."""
    command = 'jpegoptim --strip-all -m50 "{path}"'.format(path=fpath)
    exec_cmd(command)
def optimize_png(fpath):
    """Compress the PNG at *fpath* in place: pngquant first, then advdef."""
    commands = (
        'pngquant --nofs --force --ext=".png" "{path}"',
        'advdef -z -4 -i 5 "{path}"',
    )
    for template in commands:
        exec_cmd(template.format(path=fpath))
def optimize_gif(fpath):
    """Optimize the GIF at *fpath* in place with gifsicle."""
    command = 'gifsicle -O3 "{path}" -o "{path}"'.format(path=fpath)
    exec_cmd(command)
def build_zimfile(static_folder, zim_path=None, languages=None, formats=None,
                  title=None, description=None, only_books=None):
    """Build a ZIM file from *static_folder* by invoking zimwriterfs.

    :param static_folder: folder holding the generated static site.
    :param zim_path: output .zim path; derived from languages/date if None.
    :param languages: list of language codes; defaults to ['mul'].
    :param formats: list of book formats included (used in the title).
    :param title: ZIM title; derived from languages/formats if None.
    :param description: ZIM description; default project blurb if None.
    :param only_books: unused here — kept for interface compatibility.
    """
    # BUGFIX: the original used mutable default arguments ([]) and called
    # .sort() in place, mutating both the shared defaults and the caller's
    # lists. Use None sentinels and sorted() copies instead.
    languages = sorted(languages) if languages else ['mul']
    formats = sorted(formats) if formats else []
    if title is None:
        if len(languages) > 5:
            title = ("Project Gutenberg Library with {formats}"
                     .format(formats=",".join(formats)))
        else:
            title = ("Project Gutenberg Library ({langs}) with {formats}"
                     .format(langs=",".join(languages),
                             formats=",".join(formats)))
    logger.info("\tWritting ZIM for {}".format(title))
    if description is None:
        description = "The first producer of free ebooks"
    if zim_path is None:
        # NOTE(review): the two branches use different date formats
        # ('%m_%Y' vs '%Y-%m') — looks unintentional; confirm before unifying.
        if len(languages) > 1:
            zim_path = "gutenberg_all_{date}.zim".format(
                date=datetime.datetime.now().strftime('%m_%Y'))
        else:
            zim_path = "gutenberg_{lang}_all_{date}.zim".format(
                lang=languages[0],
                date=datetime.datetime.now().strftime('%Y-%m'))
    # map project codes to ISO codes for zimwriterfs
    languages = sorted(ISO_MATRIX.get(lang, lang) for lang in languages)
    context = {
        'languages': ','.join(languages),
        'title': title,
        'description': description,
        'creator': 'gutenberg.org',
        'publisher': 'Kiwix',
        'home': 'Home.html',
        'favicon': 'favicon.png',
        'static': static_folder,
        'zim': zim_path
    }
    cmd = ('zimwriterfs --welcome=\\"{home}\\" --favicon=\\"{favicon}\\" '
           '--language=\\"{languages}\\" --title=\\"{title}\\" '
           '--description=\\"{description}\\" '
           '--creator=\\"{creator}\\" --publisher=\\"{publisher}\\" '
           '\\"{static}\\" \\"{zim}\\"'
           .format(**context))
    # log the command with the shell-escaping stripped for readability
    logger.debug("\t\t{}".format(re.sub('\\\\"', '"', cmd)))
    if exec_cmd(cmd):
        logger.info("Successfuly created ZIM file at {}".format(zim_path))
    else:
        logger.error("Unable to create ZIM file :(")
def optimize_epub(src, dst):
    """Rebuild the ePUB archive at *src* into an optimized one at *dst*.

    Extracts the archive into a temp dir, optimizes images (removing an
    auto-generated cover when detected), rewrites HTML for static hosting,
    strips the license section from the NCX, patches the OPF manifest if
    the cover was dropped, then re-zips everything and removes the temp dir.

    NOTE(review): this definition duplicates an earlier optimize_epub in the
    same module (the later def wins at import time) — consider deleting one.
    It also relies on a free variable ``book`` plus module helpers
    (update_html_for_static, is_bad_cover, …) — confirm ``book`` is in scope.
    """
    logger.info("\t\tCreating ePUB at {}".format(dst))
    zipped_files = []
    # create temp directory to extract to
    tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)
    with zipfile.ZipFile(src, 'r') as zf:
        zipped_files = zf.namelist()
        zf.extractall(tmpd)
    remove_cover = False
    # BUGFIX: iterate over a copy — the loop body removes entries from
    # zipped_files, and mutating a list while iterating it skips elements.
    for fname in list(zipped_files):
        fnp = os.path.join(tmpd, fname)
        if path(fname).ext in ('.png', '.jpeg', '.jpg', '.gif'):
            # special case to remove ugly cover
            if fname.endswith('cover.jpg') and is_bad_cover(fnp):
                zipped_files.remove(fname)
                remove_cover = True
            else:
                optimize_image(path_for_cmd(fnp))
        if path(fname).ext in ('.htm', '.html'):
            # context managers instead of bare open()/close() pairs
            with open(fnp, 'r') as f:
                html = update_html_for_static(book=book,
                                              html_content=f.read(),
                                              epub=True)
            with open(fnp, 'w') as f:
                f.write(html)
        if path(fname).ext == '.ncx':
            pattern = "*** START: FULL LICENSE ***"
            with open(fnp, 'r') as f:
                ncx = f.read()
            soup = BeautifulSoup(ncx, ["lxml", "xml"])
            for tag in soup.findAll('text'):
                if pattern in tag.text:
                    container = tag.parent.parent
                    # BUGFIX: snapshot trailing siblings BEFORE decomposing —
                    # a decomposed tag loses its sibling links, so the
                    # original post-decompose loop was broken (it also had a
                    # stray no-op `s.next_sibling` statement).
                    trailing = list(container.next_siblings)
                    container.decompose()
                    for sibling in trailing:
                        # extract() works for both Tags and NavigableStrings
                        # (NavigableString has no decompose()).
                        sibling.extract()
            with open(fnp, 'w') as f:
                # NOTE(review): soup.encode() returns bytes — fine on
                # Python 2, fails on Python 3 text-mode files; confirm
                # target interpreter before changing.
                f.write(soup.encode())
    # delete {id}/cover.jpg if exist and update {id}/content.opf
    if remove_cover:
        # remove cover
        path(os.path.join(tmpd, str(book.id), 'cover.jpg')).unlink_p()
        soup = None
        opff = os.path.join(tmpd, str(book.id), 'content.opf')
        if os.path.exists(opff):
            with open(opff, 'r') as fd:
                soup = BeautifulSoup(fd.read(), ["lxml", "xml"])
            # drop the cover entry from the OPF manifest
            for elem in soup.findAll():
                if getattr(elem, 'attrs', {}).get('href') == 'cover.jpg':
                    elem.decompose()
            with open(opff, 'w') as fd:
                fd.write(soup.encode())
    # a valid ePUB requires `mimetype` first and stored uncompressed (-0)
    with cd(tmpd):
        exec_cmd('zip -q0X "{dst}" mimetype'.format(dst=path_for_cmd(dst)))
        exec_cmd('zip -qXr9D "{dst}" {files}'
                 .format(dst=path_for_cmd(dst),
                         files=" ".join([f for f in zipped_files
                                         if not f == 'mimetype'])))
    path(tmpd).rmtree_p()