Example 1
def parse_and_fill(rdf_path, concurrency, only_books=[], force=False):
    logger.info("\tLooping through RDF files in {}".format(rdf_path))

    fpaths = []
    for root, dirs, files in os.walk(rdf_path):
        if root.endswith("999999"):
            continue

        # skip books outside of requested list
        if len(only_books) and path(root).basename() not in [
                str(bid) for bid in only_books
        ]:
            continue

        for fname in files:
            if fname in (".", "..", "pg0.rdf"):
                continue

            if not fname.endswith(".rdf"):
                continue

            fpaths.append(os.path.join(root, fname))

    fpaths = sorted(
        fpaths,
        key=lambda f: int(re.match(r".*/pg([0-9]+)\.rdf", f).groups()[0]))

    def ppf(x):
        return parse_and_process_file(x, force)

    Pool(concurrency).map(ppf, fpaths)
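
Note that Pool.map needs a picklable callable when Pool is multiprocessing.Pool; a nested closure like ppf above only works if Pool is the thread-based multiprocessing.dummy.Pool, which does no pickling. A process-pool-safe sketch of the same fan-out uses functools.partial over a module-level function (process here is an illustrative stand-in, not the project's code):

import functools
from multiprocessing import Pool

def process(item, force):
    # stand-in for parse_and_process_file(item, force)
    return (item, force)

if __name__ == "__main__":
    worker = functools.partial(process, force=True)
    with Pool(2) as pool:
        print(pool.map(worker, [1, 2, 3]))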
Example 2
def copy_file(src, dst):
    logger.info("\t\tCopying {}".format(dst))
    try:
        shutil.copy2(src, dst)
    except IOError:
        logger.error("/!\\ Unable to copy missing file {}".format(src))
        return
Example 3
def download_cover(book, book_dir, s3_storage, optimizer_version):
    has_cover = Book.select(Book.cover_page).where(Book.id == book.id)
    if has_cover:
        # try to download optimized cover from cache if s3_storage
        url = "{}{}/pg{}.cover.medium.jpg".format(IMAGE_BASE, book.id, book.id)
        etag = get_etag_from_url(url)
        downloaded_from_cache = False
        cover = "{}_cover_image.jpg".format(book.id)
        if (book_dir.joinpath("optimized").joinpath(cover).exists()
                or book_dir.joinpath("unoptimized").joinpath(cover).exists()):
            logger.debug(f"Cover already exists for book #{book.id}")
            return
        if s3_storage:
            logger.info(
                f"Trying to download cover for {book.id} from optimization cache"
            )
            downloaded_from_cache = download_from_cache(
                book=book,
                etag=etag,
                book_format="cover",
                dest_dir=book_dir.joinpath("optimized"),
                s3_storage=s3_storage,
                optimizer_version=optimizer_version,
            )
        if not downloaded_from_cache:
            logger.debug("Downloading {}".format(url))
            if download_file(url,
                             book_dir.joinpath("unoptimized").joinpath(cover)):
                book.cover_etag = etag
                book.save()
    else:
        logger.debug("No Book Cover found for Book #{}".format(book.id))
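
get_etag_from_url is a project helper; presumably it issues an HTTP HEAD request and reads the ETag response header. A hedged stdlib sketch of that idea (the real helper may differ):

from urllib.request import Request, urlopen

def head_etag(url, timeout=10):
    # HEAD avoids downloading the body just to read the ETag header
    req = Request(url, method="HEAD")
    try:
        with urlopen(req, timeout=timeout) as resp:
            return resp.headers.get("ETag")
    except OSError:
        return None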
Example 4
def download_rdf_file(rdf_url):
    fname = "rdf-files.tar.bz2"

    if path(fname).exists():
        logger.info("\trdf-files.tar.bz2 already exists in {}".format(fname))
        return fname

    logger.info("\tDownloading {} into {}".format(rdf_url, fname))
    download_file(rdf_url, pathlib.Path(fname).resolve())

    return fname
Example 5
def optimize_image(src, dst, force=False):
    if path(dst).exists() and not force:
        logger.info("\tSkipping image optimization for {}".format(dst))
        return dst
    logger.info("\tOptimizing image {}".format(dst))
    if path(src).ext == '.png':
        return optimize_png(src, dst)
    if path(src).ext in ('.jpg', '.jpeg'):
        return optimize_jpeg(src, dst)
    if path(src).ext == '.gif':
        return optimize_gif(src, dst)
    return dst
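
The chain of extension checks is a dispatch table in disguise; restated as a mapping, which also normalizes case (the version above would miss an uppercase .PNG). A sketch with string stand-ins for the optimize_* helpers:

import os

OPTIMIZERS = {".png": "optimize_png", ".jpg": "optimize_jpeg",
              ".jpeg": "optimize_jpeg", ".gif": "optimize_gif"}

def pick_optimizer(src):
    ext = os.path.splitext(src)[1].lower()
    return OPTIMIZERS.get(ext)

assert pick_optimizer("cover.JPG") == "optimize_jpeg"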
Example 6
def symlink_from_cache(fname, dstfname=None):
    src = os.path.join(path(download_cache).abspath(), fname)
    if dstfname is None:
        dstfname = fname
    dst = os.path.join(path(static_folder).abspath(), dstfname)
    logger.info("\t\tSymlinking {}".format(dst))
    path(dst).unlink_p()
    try:
        path(src).link(dst)  # hard link
    except IOError:
        logger.error("/!\\ Unable to symlink missing file {}".format(src))
        return
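
Despite its name, symlink_from_cache creates a hard link (path.link wraps os.link). A stdlib sketch of both variants for comparison:

import os

def link_into(src, dst, symbolic=False):
    # unlink any stale destination first, mirroring unlink_p() above
    if os.path.lexists(dst):
        os.unlink(dst)
    (os.symlink if symbolic else os.link)(src, dst)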
Example 7
def setup_database(wipe=False):
    logger.info("Setting up the database")

    for model in (License, Format, Author, Book, BookFormat, Url):
        if wipe:
            model.drop_table(fail_silently=True)
        if not model.table_exists():
            model.create_table()
            logger.debug("Created table for {}".format(model._meta.name))
            load_fixtures(model)
        else:
            logger.debug("{} table already exists.".format(model._meta.name))
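
The models here are peewee models: drop_table, table_exists and create_table are peewee's schema API. A minimal self-contained sketch of the same create-if-missing pattern against an in-memory SQLite database (the real models and load_fixtures live in the project):

from peewee import CharField, Model, SqliteDatabase

db = SqliteDatabase(":memory:")

class Author(Model):
    name = CharField()

    class Meta:
        database = db

db.connect()
if not Author.table_exists():
    Author.create_table()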
Example 8
def copy_from_cache(fname, dstfname=None):
    src = os.path.join(path(download_cache).abspath(), fname)
    if dstfname is None:
        dstfname = fname
    dst = os.path.join(path(static_folder).abspath(), dstfname)
    logger.info("\t\tCopying {}".format(dst))
    path(dst).unlink_p()
    try:
        path(src).copy(dst)
    except IOError:
        logger.error("/!\\ Unable to copy missing file {}".format(src))
        return
Example 9
def optimize_image(src, dst, force=False):
    if dst.exists() and not force:
        logger.info("\tSkipping image optimization for {}".format(dst))
        return dst
    logger.info("\tOptimizing image {}".format(dst))
    if src.suffix == ".png":
        return optimize_png(str(src.resolve()), str(dst.resolve()))
    if src.suffix in (".jpg", ".jpeg"):
        return optimize_jpeg(str(src.resolve()), str(dst.resolve()))
    if src.suffix == ".gif":
        return optimize_gif(str(src.resolve()), str(dst.resolve()))
    return dst
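
This revision is the pathlib port of the helper above; the str(src.resolve()) round-trips exist presumably because the optimize_* helpers build command lines from strings. Since Python 3.6, subprocess itself accepts os.PathLike arguments, so a sketch of one such helper can take Path objects directly (gifsicle flags as used elsewhere in this listing):

import subprocess
from pathlib import Path

def optimize_gif(src: Path, dst: Path):
    # subprocess converts os.PathLike arguments itself (Python 3.6+)
    subprocess.run(["gifsicle", "-O3", src, "-o", dst], check=True)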
Example 10
def extract_rdf_files(rdf_tarball, rdf_path, force=False):
    if path(rdf_path).exists() and not force:
        logger.info("\tRDF-files folder already exists in {}".format(rdf_path))
        return

    logger.info("\tExtracting {} into {}".format(rdf_tarball, rdf_path))

    # create dest dir if it doesn't exist
    dest = path(rdf_path)
    dest.mkdir_p()

    exec_cmd([
        "tar", "-C", rdf_path, "--strip-components", "2", "-x", "-f",
        rdf_tarball
    ])
    return
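
Shelling out to tar keeps this short; for reference, a stdlib sketch with the same --strip-components 2 semantics via tarfile:

import tarfile
from pathlib import Path

def extract_stripped(tarball, dest, strip=2):
    Path(dest).mkdir(parents=True, exist_ok=True)
    with tarfile.open(tarball) as tf:
        for member in tf.getmembers():
            parts = Path(member.name).parts[strip:]
            if not parts:
                continue  # entry entirely consumed by the stripped prefix
            member.name = str(Path(*parts))
            tf.extract(member, dest)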
Example 11
def write_book_presentation_article(
    static_folder, book, force, project_id, title_search, add_bookshelves, books
):
    cover_fpath = static_folder.joinpath(article_name_for(book=book, cover=True))
    if not cover_fpath.exists() or force:
        logger.info("\t\tExporting to {}".format(cover_fpath))
        html = cover_html_content_for(
            book=book,
            static_folder=static_folder,
            books=books,
            project_id=project_id,
            title_search=title_search,
            add_bookshelves=add_bookshelves,
        )
        with open(cover_fpath, "w") as f:
            if six.PY2:
                f.write(html.encode(UTF8))
            else:
                f.write(html)
    else:
        logger.info("\t\tSkipping cover {}".format(cover_fpath))
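
The six.PY2 branch only matters while the code straddles Python 2 and 3; on Python 3 alone the write collapses to a single call, with the encoding pinned explicitly instead of left to the locale. A sketch:

def write_text(fpath, text):
    # Python 3 only: encode at the file layer rather than via six
    with open(fpath, "w", encoding="utf-8") as f:
        f.write(text)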
Example 12
def build_zimfile(static_folder, zim_path=None,
                  languages=[], formats=[],
                  title=None, description=None,
                  only_books=[],
                  create_index=True, force=False):

    # revert HTML/JS/CSS to zim-compatible versions
    export_skeleton(static_folder=static_folder, dev_mode=False,
                    languages=languages, formats=formats,
                    only_books=only_books)

    if not languages:
        languages = ['mul']

    languages.sort()
    formats.sort()

    if title is None:
        if len(languages) > 5:
            title = ("Project Gutenberg Library with {formats}"
                     .format(formats=",".join(formats)))
        else:
            title = ("Project Gutenberg Library ({langs}) with {formats}"
                     .format(langs=",".join(languages),
                             formats=",".join(formats)))

    logger.info("\tWriting ZIM for {}".format(title))

    if description is None:
        description = "The first producer of free ebooks"

    project_id = get_project_id(languages, formats, only_books)

    if zim_path is None:
        zim_path = "{}.zim".format(project_id)

    if path(zim_path).exists() and not force:
        logger.info("ZIM file `{}` already exists.".format(zim_path))
        return

    languages = [ISO_MATRIX.get(lang, lang) for lang in languages]
    languages.sort()

    cmd = ['zimwriterfs',
           '--welcome', "Home.html",
           '--favicon', "favicon.png",
           '--language', ','.join(languages),
           '--name', project_id,
           '--title', title,
           '--description', description,
           '--creator', "gutenberg.org",
           '--publisher', "Kiwix",
           static_folder, zim_path]

    if create_index:
        cmd.insert(1, '--withFullTextIndex')
    if exec_cmd(cmd) == 0:
        logger.info("Successfully created ZIM file at {}".format(zim_path))
    else:
        logger.error("Unable to create ZIM file :(")
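
A caveat on the signature: languages=[], formats=[] and only_books=[] are evaluated once at definition time, and the in-place .sort() calls above then mutate those shared defaults across invocations. The conventional None-default pattern, as a sketch:

def build(languages=None, formats=None):
    languages = sorted(languages) if languages else ["mul"]
    formats = sorted(formats or [])
    return languages, formats

assert build() == (["mul"], [])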
Example 13
def parse_and_process_file(rdf_file, force=False):
    if not path(rdf_file).exists():
        raise ValueError(rdf_file)

    gid = re.match(r".*/pg([0-9]+)\.rdf", rdf_file).groups()[0]

    if Book.get_or_none(id=int(gid)):
        logger.info("\tSkipping already parsed file {}".format(rdf_file))
        return

    logger.info("\tParsing file {}".format(rdf_file))
    with open(rdf_file, "r") as f:
        parser = RdfParser(f.read(), gid).parse()

    if parser.license == "None":
        logger.info(
            "\tWARN: Unusable book without any information {}".format(gid))
    elif parser.title == "":
        logger.info("\tWARN: Unusable book without title {}".format(gid))
    else:
        save_rdf_in_database(parser)
Example 14
    def handle_companion_file(fname,
                              dstfname=None,
                              book=None,
                              force=False,
                              as_ext=None):
        ext = path(fname).ext if as_ext is None else as_ext
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        if path(dst).exists() and not force:
            logger.debug("\t\tSkipping existing companion {}".format(dstfname))
            return

        # optimization based on mime/extension
        if ext in (".png", ".jpg", ".jpeg", ".gif"):
            logger.info(
                "\t\tCopying and optimizing image companion {}".format(fname))
            # copy_from_cache(src, dst)
            optimize_image(src, dst)
        elif ext == ".epub":
            logger.info("\t\tCreating optimized EPUB file {}".format(fname))
            tmp_epub = tempfile.NamedTemporaryFile(suffix=".epub",
                                                   dir=TMP_FOLDER)
            tmp_epub.close()
            try:
                optimize_epub(src, tmp_epub.name)
            except zipfile.BadZipFile:
                logger.warning("\t\tBad zip file. "
                               "Copying as it might be working: {}".format(fname))
                handle_companion_file(fname,
                                      dstfname,
                                      book,
                                      force,
                                      as_ext="zip")
            else:
                path(tmp_epub.name).move(dst)
        else:
            # excludes files created by Windows Explorer
            if src.endswith("_Thumbs.db"):
                return
            # copy otherwise (PDF mostly)
            logger.debug("\t\tshitty ext: {}".format(dst))
            logger.info("\t\tCopying companion file to {}".format(fname))
            copy_from_cache(src, dst)
Example 15
    def handle_companion_file(
        fname,
        dstfname=None,
        book=None,
        force=False,
        as_ext=None,
        html_file_list=None,
        s3_storage=None,
    ):
        ext = fname.suffix if as_ext is None else as_ext
        src = fname
        if dstfname is None:
            dstfname = fname.name
        dst = static_folder.joinpath(dstfname)
        if dst.exists() and not force:
            logger.debug("\t\tSkipping existing companion {}".format(dstfname))
            return

        # optimization based on mime/extension
        if ext in (".png", ".jpg", ".jpeg", ".gif"):
            logger.info("\t\tCopying and optimizing image companion {}".format(fname))
            optimize_image(src, dst)
            if dst.name == (f"{book.id}_cover_image.jpg") and s3_storage:
                upload_to_cache(
                    asset=dst,
                    book_format="cover",
                    book_id=book.id,
                    etag=book.cover_etag,
                    s3_storage=s3_storage,
                    optimizer_version=optimizer_version,
                )
                update_download_cache(src, dst)
            elif html_file_list:
                html_file_list.append(dst)
                update_download_cache(src, dst)
        elif ext == ".epub":
            logger.info("\t\tCreating optimized EPUB file {}".format(fname))
            tmp_epub = tempfile.NamedTemporaryFile(suffix=".epub", dir=TMP_FOLDER)
            tmp_epub.close()
            try:
                optimize_epub(src, tmp_epub.name)
            except zipfile.BadZipFile:
                logger.warning(
                    "\t\tBad zip file. "
                    "Copying as it might be working: {}".format(fname)
                )
                handle_companion_file(fname, dstfname, book, force, as_ext=".zip")
            else:
                path(tmp_epub.name).move(dst)
                if s3_storage:
                    upload_to_cache(
                        asset=dst,
                        book_format="epub",
                        book_id=book.id,
                        etag=book.epub_etag,
                        s3_storage=s3_storage,
                        optimizer_version=optimizer_version,
                    )
                    update_download_cache(src, dst)
        else:
            # excludes files created by Windows Explorer
            if src.name.endswith("_Thumbs.db"):
                return
            # copy otherwise (PDF mostly)
            logger.info("\t\tCopying companion file to {}".format(dst))
            copy_file(src, dst)
            if ext != ".pdf" and ext != ".zip" and html_file_list:
                html_file_list.append(dst)
                update_download_cache(src, dst)
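
In the EPUB branch, NamedTemporaryFile is created and immediately closed just to reserve a unique path for optimize_epub to write to (closing a default NamedTemporaryFile also deletes the file, so only the name survives). tempfile.mkstemp states that intent directly; a sketch:

import os
import tempfile

def reserve_path(suffix=".epub", dir=None):
    # reserve a unique filename and hand back only the path
    fd, name = tempfile.mkstemp(suffix=suffix, dir=dir)
    os.close(fd)
    return name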
Example 16
    def optimize_epub(src, dst):
        logger.info("\t\tCreating ePUB off {} at {}".format(src, dst))
        zipped_files = []
        # create temp directory to extract to
        tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)

        try:
            with zipfile.ZipFile(src, "r") as zf:
                zipped_files = zf.namelist()
                zf.extractall(tmpd)
        except zipfile.BadZipFile as exc:
            shutil.rmtree(tmpd)
            raise exc

        remove_cover = False
        # iterate over a copy: the cover entry may be removed from
        # zipped_files inside the loop
        for fname in list(zipped_files):
            fnp = os.path.join(tmpd, fname)
            if path(fname).ext in (".png", ".jpeg", ".jpg", ".gif"):

                # special case to remove ugly cover
                if fname.endswith("cover.jpg") and is_bad_cover(fnp):
                    zipped_files.remove(fname)
                    remove_cover = True
                else:
                    optimize_image(pathlib.Path(fnp), pathlib.Path(fnp), force=True)

            if path(fname).ext in (".htm", ".html"):
                html_content, _ = read_file(fnp)
                html = update_html_for_static(
                    book=book, html_content=html_content, epub=True
                )
                save_bs_output(html, fnp, UTF8)

            if path(fname).ext == ".ncx":
                pattern = "*** START: FULL LICENSE ***"
                ncx, _ = read_file(fnp)
                soup = BeautifulSoup(ncx, "lxml-xml")
                for tag in soup.findAll("text"):
                    if pattern in tag.text:
                        s = tag.parent.parent
                        # collect trailing siblings before decomposing:
                        # decompose() breaks the chain being iterated
                        for sibling in list(s.next_siblings):
                            if hasattr(sibling, "decompose"):
                                sibling.decompose()
                        s.decompose()

                save_bs_output(soup, fnp, UTF8)

        # delete {id}/cover.jpg if exist and update {id}/content.opf
        if remove_cover:

            # remove cover
            path(os.path.join(tmpd, text_type(book.id), "cover.jpg")).unlink_p()

            soup = None
            opff = os.path.join(tmpd, text_type(book.id), "content.opf")
            if os.path.exists(opff):
                opff_content, _ = read_file(opff)
                soup = BeautifulSoup(opff_content, "lxml-xml")

                for elem in soup.findAll():
                    if getattr(elem, "attrs", {}).get("href") == "cover.jpg":
                        elem.decompose()

                save_bs_output(soup, opff, UTF8)

        # bundle epub as zip
        zip_epub(epub_fpath=dst, root_folder=tmpd, fpaths=zipped_files)

        path(tmpd).rmtree_p()
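
The mkdtemp/rmtree_p pair cleans up on the happy path and on BadZipFile, but an exception anywhere in between leaks the directory. tempfile.TemporaryDirectory scopes the cleanup to a with block; a sketch of the same lifecycle:

import tempfile
import zipfile

def with_extracted(src, process):
    # process(tmpd) runs while the extracted tree exists; cleanup is guaranteed
    with tempfile.TemporaryDirectory() as tmpd:
        with zipfile.ZipFile(src) as zf:
            zf.extractall(tmpd)
        return process(tmpd)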
Example 17
def export_book_to(book,
                   static_folder, download_cache,
                   cached_files, languages, formats, books,
                   project_id, force=False):
    logger.info("\tExporting Book #{id}.".format(id=book.id))

    # actual book content, as HTML
    html, encoding = html_content_for(book=book,
                                      static_folder=static_folder,
                                      download_cache=download_cache)
    if html:
        article_fpath = os.path.join(static_folder, article_name_for(book))
        if not path(article_fpath).exists() or force:
            logger.info("\t\tExporting to {}".format(article_fpath))
            try:
                new_html = update_html_for_static(book=book, html_content=html)
            except Exception:
                raise
            save_bs_output(new_html, article_fpath, UTF8)
        else:
            logger.info("\t\tSkipping HTML article {}".format(article_fpath))

    def symlink_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tSymlinking {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).link(dst)  # hard link
        except IOError:
            logger.error("/!\\ Unable to symlink missing file {}".format(src))
            return

    def copy_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tCopying {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).copy(dst)
        except IOError:
            logger.error("/!\\ Unable to copy missing file {}".format(src))
            return

    def optimize_image(src, dst, force=False):
        if path(dst).exists() and not force:
            logger.info("\tSkipping image optimization for {}".format(dst))
            return dst
        logger.info("\tOptimizing image {}".format(dst))
        if path(src).ext == '.png':
            return optimize_png(src, dst)
        if path(src).ext in ('.jpg', '.jpeg'):
            return optimize_jpeg(src, dst)
        if path(src).ext == '.gif':
            return optimize_gif(src, dst)
        return dst

    def optimize_gif(src, dst):
        exec_cmd(['gifsicle', '-O3', src, '-o', dst])

    def optimize_png(src, dst):
        exec_cmd(['pngquant', '--nofs', '--force',
                  '--output', dst, src])
        exec_cmd(['advdef', '-z', '-4', '-i', '5', dst])

    def optimize_jpeg(src, dst):
        copy_from_cache(src, dst)
        exec_cmd(['jpegoptim', '--strip-all', '-m50', dst])

    def optimize_epub(src, dst):
        logger.info("\t\tCreating ePUB off {} at {}".format(src, dst))
        zipped_files = []
        # create temp directory to extract to
        tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)

        with zipfile.ZipFile(src, 'r') as zf:
            zipped_files = zf.namelist()
            zf.extractall(tmpd)

        remove_cover = False
        # iterate over a copy: the cover entry may be removed from
        # zipped_files inside the loop
        for fname in list(zipped_files):
            fnp = os.path.join(tmpd, fname)
            if path(fname).ext in ('.png', '.jpeg', '.jpg', '.gif'):

                # special case to remove ugly cover
                if fname.endswith('cover.jpg') and is_bad_cover(fnp):
                    zipped_files.remove(fname)
                    remove_cover = True
                else:
                    optimize_image(fnp, fnp)

            if path(fname).ext in ('.htm', '.html'):
                html_content, html_encoding = read_file(fnp)
                html = update_html_for_static(book=book,
                                              html_content=html_content,
                                              epub=True)
                save_bs_output(html, fnp, UTF8)

            if path(fname).ext == '.ncx':
                pattern = "*** START: FULL LICENSE ***"
                ncx, ncx_encoding = read_file(fnp)
                soup = BeautifulSoup(ncx, 'lxml-xml')
                for tag in soup.findAll('text'):
                    if pattern in tag.text:
                        s = tag.parent.parent
                        # collect trailing siblings before decomposing:
                        # decompose() breaks the chain being iterated
                        for sibling in list(s.next_siblings):
                            if hasattr(sibling, 'decompose'):
                                sibling.decompose()
                        s.decompose()

                save_bs_output(soup, fnp, UTF8)

        # delete {id}/cover.jpg if exist and update {id}/content.opf
        if remove_cover:

            # remove cover
            path(
                os.path.join(tmpd, text_type(book.id), 'cover.jpg')).unlink_p()

            soup = None
            opff = os.path.join(tmpd, text_type(book.id), 'content.opf')
            if os.path.exists(opff):
                opff_content, opff_encoding = read_file(opff)
                soup = BeautifulSoup(opff_content, 'lxml-xml')

                for elem in soup.findAll():
                    if getattr(elem, 'attrs', {}).get('href') == 'cover.jpg':
                        elem.decompose()

                save_bs_output(soup, opff, UTF8)

        # bundle epub as zip
        zip_epub(epub_fpath=dst,
                 root_folder=tmpd,
                 fpaths=zipped_files)

        path(tmpd).rmtree_p()

    def handle_companion_file(fname, dstfname=None, book=None,
                              force=False, as_ext=None):
        ext = path(fname).ext if as_ext is None else as_ext
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        if path(dst).exists() and not force:
            logger.debug("\t\tSkipping existing companion {}".format(dstfname))
            return

        # optimization based on mime/extension
        if ext in ('.png', '.jpg', '.jpeg', '.gif'):
            logger.info("\t\tCopying and optimizing image companion {}"
                        .format(fname))
            # copy_from_cache(src, dst)
            optimize_image(src, dst)
        elif ext == '.epub':
            logger.info("\t\tCreating optimized EPUB file {}".format(fname))
            tmp_epub = tempfile.NamedTemporaryFile(suffix='.epub',
                                                   dir=TMP_FOLDER)
            tmp_epub.close()
            try:
                optimize_epub(src, tmp_epub.name)
            except zipfile.BadZipFile:
                logger.warning("\t\tBad zip file. "
                               "Copying as it might be working: {}".format(fname))
                handle_companion_file(fname, dstfname, book, force,
                                      as_ext='zip')
            else:
                path(tmp_epub.name).move(dst)
        else:
            # excludes files created by Windows Explorer
            if src.endswith('_Thumbs.db'):
                return
            # copy otherwise (PDF mostly)
            logger.debug("\t\tshitty ext: {}".format(dst))
            logger.info("\t\tCopying companion file to {}".format(fname))
            copy_from_cache(src, dst)

    # associated files (images, etc)
    for fname in [fn for fn in cached_files
                  if fn.startswith("{}_".format(book.id))]:

        if path(fname).ext in ('.html', '.htm'):
            src = os.path.join(path(download_cache).abspath(), fname)
            dst = os.path.join(path(static_folder).abspath(), fname)

            if path(dst).exists() and not force:
                logger.debug("\t\tSkipping existing HTML {}".format(dst))
                continue

            logger.info("\t\tExporting HTML file to {}".format(dst))
            html, encoding = read_file(src)
            new_html = update_html_for_static(book=book, html_content=html)
            save_bs_output(new_html, dst, UTF8)
        else:
            try:
                handle_companion_file(fname, force=force)
            except Exception as e:
                logger.exception(e)
                logger.error("\t\tException while handling companion file: {}"
                             .format(e))

    # other formats
    for format in formats:
        if format not in book.formats() or format == 'html':
            continue
        try:
            handle_companion_file(fname_for(book, format),
                                  archive_name_for(book, format),
                                  force=force)
        except Exception as e:
            logger.exception(e)
            logger.error("\t\tException while handling companion file: {}"
                         .format(e))

    # book presentation article
    cover_fpath = os.path.join(static_folder,
                               article_name_for(book=book, cover=True))
    if not path(cover_fpath).exists() or force:
        logger.info("\t\tExporting to {}".format(cover_fpath))
        html = cover_html_content_for(book=book,
                                      static_folder=static_folder,
                                      books=books, project_id=project_id)
        with open(cover_fpath, 'w') as f:
            if six.PY2:
                f.write(html.encode(UTF8))
            else:
                f.write(html)
    else:
        logger.info("\t\tSkipping cover {}".format(cover_fpath))
Example 18
def export_to_json_helpers(books, static_folder, languages,
                           formats, project_id):

    def dumpjs(col, fn, var='json_data'):
        with open(os.path.join(static_folder, fn), 'w') as f:
            f.write("var {var} = ".format(var=var))
            f.write(json.dumps(col))
            f.write(";")
            # json.dump(col, f)

    # all books sorted by popularity
    logger.info("\t\tDumping full_by_popularity.js")
    dumpjs([book.to_array()
            for book in books.order_by(Book.downloads.desc())],
           'full_by_popularity.js')

    # all books sorted by title
    logger.info("\t\tDumping full_by_title.js")
    dumpjs([book.to_array()
            for book in books.order_by(Book.title.asc())],
           'full_by_title.js')

    avail_langs = get_langs_with_count(books=books)

    all_filtered_authors = []

    # language-specific collections
    for lang_name, lang, lang_count in avail_langs:
        lang_filtered_authors = list(
            set([book.author.gut_id for book in books.filter(language=lang)]))
        for aid in lang_filtered_authors:
            if aid not in all_filtered_authors:
                all_filtered_authors.append(aid)

        # by popularity
        logger.info("\t\tDumping lang_{}_by_popularity.js".format(lang))
        dumpjs(
            [book.to_array()
             for book in books.where(Book.language == lang)
                              .order_by(Book.downloads.desc())],
            'lang_{}_by_popularity.js'.format(lang))
        # by title
        logger.info("\t\tDumping lang_{}_by_title.js".format(lang))
        dumpjs(
            [book.to_array()
             for book in books.where(Book.language == lang)
                              .order_by(Book.title.asc())],
            'lang_{}_by_title.js'.format(lang))

        authors = authors_from_ids(lang_filtered_authors)
        logger.info("\t\tDumping authors_lang_{}.js".format(lang))
        dumpjs([author.to_array() for author in authors],
               'authors_lang_{}.js'.format(lang), 'authors_json_data')

    # author specific collections
    authors = authors_from_ids(all_filtered_authors)
    for author in authors:

        # all_filtered_authors.remove(author.gut_id)
        # by popularity
        logger.info(
            "\t\tDumping auth_{}_by_popularity.js".format(author.gut_id))
        dumpjs(
            [book.to_array()
             for book in books.where(Book.author == author)
                              .order_by(Book.downloads.desc())],
            'auth_{}_by_popularity.js'.format(author.gut_id))
        # by title
        logger.info("\t\tDumping auth_{}_by_title.js".format(author.gut_id))
        dumpjs(
            [book.to_array()
             for book in books.where(Book.author == author)
                              .order_by(Book.title.asc())],
            'auth_{}_by_title.js'.format(author.gut_id))
        # by language
        for lang_name, lang, lang_count in avail_langs:
            logger.info("\t\tDumping auth_{}_by_lang_{}.js"
                        .format(author.gut_id, lang))
            dumpjs(
                [book.to_array()
                 for book in books.where(Book.language == lang)
                                  .where(Book.author == author)
                                  .order_by(Book.downloads.desc())],
                'auth_{}_lang_{}_by_popularity.js'.format(author.gut_id, lang))

            dumpjs(
                [book.to_array()
                 for book in books.where(Book.language == lang)
                                  .where(Book.author == author)
                                  .order_by(Book.title.asc())],
                'auth_{}_lang_{}_by_title.js'.format(author.gut_id, lang))

        # author HTML redirect file
        save_author_file(author, static_folder, books, project_id, force=True)

    # authors list sorted by name
    logger.info("\t\tDumping authors.js")
    dumpjs([author.to_array() for author in authors],
           'authors.js', 'authors_json_data')

    # languages list sorted by code
    logger.info("\t\tDumping languages.js")
    dumpjs(avail_langs, 'languages.js', 'languages_json_data')

    # languages by weight
    main_languages, other_languages = get_lang_groups(books)
    logger.info("\t\tDumping main_languages.js")
    dumpjs(main_languages, 'main_languages.js', 'main_languages_json_data')
    dumpjs(other_languages, 'other_languages.js', 'other_languages_json_data')
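
Every dump goes through dumpjs, which writes JSONP-style "var NAME = <json>;" files so the static frontend can load data with plain <script> tags. Standalone, the helper amounts to:

import json

def dumpjs(col, fpath, var="json_data"):
    with open(fpath, "w", encoding="utf-8") as f:
        f.write("var {} = {};".format(var, json.dumps(col)))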
Example 19
def download_book(book, download_cache, languages, formats, force):
    logger.info(
        "\tDownloading content files for Book #{id}".format(id=book.id))

    # apply filters
    if not formats:
        formats = FORMAT_MATRIX.keys()

    # HTML is our base for ZIM so add it if not present
    if "html" not in formats:
        formats.append("html")

    for format in formats:

        fpath = os.path.join(download_cache, fname_for(book, format))

        # check if already downloaded
        if path(fpath).exists() and not force:
            logger.debug("\t\t{fmt} already exists at {path}".format(
                fmt=format, path=fpath))
            continue

        # retrieve corresponding BookFormat
        bfs = BookFormat.filter(book=book)

        if format == "html":
            patterns = [
                "mnsrb10h.htm",
                "8ledo10h.htm",
                "tycho10f.htm",
                "8ledo10h.zip",
                "salme10h.htm",
                "8nszr10h.htm",
                "{id}-h.html",
                "{id}.html.gen",
                "{id}-h.htm",
                "8regr10h.zip",
                "{id}.html.noimages",
                "8lgme10h.htm",
                "tycho10h.htm",
                "tycho10h.zip",
                "8lgme10h.zip",
                "8indn10h.zip",
                "8resp10h.zip",
                "20004-h.htm",
                "8indn10h.htm",
                "8memo10h.zip",
                "fondu10h.zip",
                "{id}-h.zip",
                "8mort10h.zip",
            ]
            bfso = bfs
            bfs = bfs.join(Format).filter(Format.pattern << patterns)
            if not bfs.count():
                pp(
                    list([(b.format.mime, b.format.images, b.format.pattern)
                          for b in bfs]))
                pp(
                    list([(b.format.mime, b.format.images, b.format.pattern)
                          for b in bfso]))
                logger.error("html not found")
                continue
        else:
            bfs = bfs.filter(BookFormat.format << Format.filter(
                mime=FORMAT_MATRIX.get(format)))

        if not bfs.count():
            logger.debug("[{}] not avail. for #{}# {}".format(
                format, book.id, book.title).encode("utf-8"))
            continue

        if bfs.count() > 1:
            try:
                bf = bfs.join(Format).filter(Format.images).get()
            except Exception:
                bf = bfs.get()
        else:
            bf = bfs.get()

        logger.debug("[{}] Requesting URLs for #{}# {}".format(
            format, book.id, book.title).encode("utf-8"))

        # retrieve list of URLs for format unless we have it in DB
        if bf.downloaded_from and not force:
            urls = [bf.downloaded_from]
        else:
            urld = get_urls(book)
            urls = list(reversed(urld.get(FORMAT_MATRIX.get(format))))

        allurls = list(urls)

        while urls:
            url = urls.pop()

            if len(allurls) != 1:
                if not resource_exists(url):
                    continue

            # HTML files are *sometimes* available as ZIP files
            if url.endswith(".zip"):
                zpath = "{}.zip".format(fpath)

                if not download_file(url, zpath):
                    logger.error("ZIP file download failed: {}".format(zpath))
                    continue

                # extract zipfile
                handle_zipped_epub(zippath=zpath,
                                   book=book,
                                   download_cache=download_cache)
            else:
                if not download_file(url, fpath):
                    logger.error("file download failed: {}".format(fpath))
                    continue

            # store working URL in DB
            bf.downloaded_from = url
            bf.save()

        if not bf.downloaded_from:
            logger.error("NO FILE FOR #{}/{}".format(book.id, format))
            pp(allurls)
            continue
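
The while/pop loop boils down to trying candidate URLs until one download succeeds and remembering the winner. Distilled into a sketch, with fetch standing in for the project's download_file:

def first_working(urls, fetch):
    for url in urls:
        if fetch(url):
            return url
    return None

assert first_working(["a", "b"], lambda u: u == "b") == "b"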
Example 20
def export_to_json_helpers(
    books, static_folder, languages, formats, project_id, title_search, add_bookshelves
):
    def dumpjs(col, fn, var="json_data"):
        with open(os.path.join(static_folder, fn), "w") as f:
            f.write("var {var} = ".format(var=var))
            f.write(json.dumps(col))
            f.write(";")
            # json.dump(col, f)

    # all books sorted by popularity
    logger.info("\t\tDumping full_by_popularity.js")
    dumpjs(
        [book.to_array() for book in books.order_by(Book.downloads.desc())],
        "full_by_popularity.js",
    )

    # all books sorted by title
    logger.info("\t\tDumping full_by_title.js")
    dumpjs(
        [book.to_array() for book in books.order_by(Book.title.asc())],
        "full_by_title.js",
    )

    avail_langs = get_langs_with_count(books=books)

    all_filtered_authors = []

    # language-specific collections
    for lang_name, lang, lang_count in avail_langs:
        lang_filtered_authors = list(
            set([book.author.gut_id for book in books.filter(language=lang)])
        )
        for aid in lang_filtered_authors:
            if aid not in all_filtered_authors:
                all_filtered_authors.append(aid)

        # by popularity
        logger.info("\t\tDumping lang_{}_by_popularity.js".format(lang))
        dumpjs(
            [
                book.to_array()
                for book in books.where(Book.language == lang).order_by(
                    Book.downloads.desc()
                )
            ],
            "lang_{}_by_popularity.js".format(lang),
        )
        # by title
        logger.info("\t\tDumping lang_{}_by_title.js".format(lang))
        dumpjs(
            [
                book.to_array()
                for book in books.where(Book.language == lang).order_by(
                    Book.title.asc()
                )
            ],
            "lang_{}_by_title.js".format(lang),
        )

        authors = authors_from_ids(lang_filtered_authors)
        logger.info("\t\tDumping authors_lang_{}.js".format(lang))
        dumpjs(
            [author.to_array() for author in authors],
            "authors_lang_{}.js".format(lang),
            "authors_json_data",
        )

    if add_bookshelves:
        bookshelves = bookshelf_list(books)
        for bookshelf in bookshelves:
            # exclude the books with no bookshelf data
            if bookshelf is None:
                continue
            # dumpjs for bookshelf by popularity
            # this will allow the popularity button to use this js on the
            # particular bookshelf page
            logger.info("\t\tDumping bookshelf_{}_by_popularity.js".format(bookshelf))
            dumpjs(
                [
                    book.to_array()
                    for book in books.select()
                    .where(Book.bookshelf == bookshelf)
                    .order_by(Book.downloads.desc())
                ],
                "bookshelf_{}_by_popularity.js".format(bookshelf),
            )

            # by title
            logger.info("\t\tDumping bookshelf_{}_by_title.js".format(bookshelf))
            dumpjs(
                [
                    book.to_array()
                    for book in books.select()
                    .where(Book.bookshelf == bookshelf)
                    .order_by(Book.title.asc())
                ],
                "bookshelf_{}_by_title.js".format(bookshelf),
            )
            # by language
            for lang_name, lang, lang_count in avail_langs:
                logger.info(
                    "\t\tDumping bookshelf_{}_by_lang_{}.js".format(bookshelf, lang)
                )
                dumpjs(
                    [
                        book.to_array()
                        for book in books.select()
                        .where(Book.language == lang)
                        .where(Book.bookshelf == bookshelf)
                        .order_by(Book.downloads.desc())
                    ],
                    "bookshelf_{}_lang_{}_by_popularity.js".format(bookshelf, lang),
                )

                dumpjs(
                    [
                        book.to_array()
                        for book in books.select()
                        .where(Book.language == lang)
                        .where(Book.bookshelf == bookshelf)
                        .order_by(Book.title.asc())
                    ],
                    "bookshelf_{}_lang_{}_by_title.js".format(bookshelf, lang),
                )

        # dump all bookshelves from any given language
        for lang_name, lang, lang_count in avail_langs:
            logger.info("\t\tDumping bookshelves_lang_{}.js".format(lang))
            temp = bookshelf_list_language(books, lang)
            dumpjs(temp, "bookshelves_lang_{}.js".format(lang))

        logger.info("\t\tDumping bookshelves.js")
        dumpjs(bookshelves, "bookshelves.js", "bookshelves_json_data")

        # Create the bookshelf home page
        context = get_default_context(project_id=project_id, books=books)
        context.update({"bookshelf_home": True, "add_bookshelves": True})
        template = jinja_env.get_template("bookshelf_home.html")
        rendered = template.render(**context)
        save_bs_output(
            rendered, os.path.join(static_folder, "bookshelf_home.html"), UTF8
        )

        # add individual bookshelf pages
        for bookshelf in bookshelves:
            if bookshelf is None:
                continue
            context["bookshelf"] = bookshelf
            context.update(
                {
                    "bookshelf_home": False,
                    "individual_book_shelf": True,
                    "no_filters": True,
                    "add_bookshelves": True,
                }
            )
            template = jinja_env.get_template("bookshelf.html")
            rendered = template.render(**context)
            savepath = os.path.join(static_folder, "{}.html".format(bookshelf))
            # logger.info("Saving {} to {}".format(bookshelf, savepath))
            save_bs_output(rendered, savepath, UTF8)

    # author specific collections
    authors = authors_from_ids(all_filtered_authors)
    for author in authors:

        # all_filtered_authors.remove(author.gut_id)
        # by popularity
        logger.info("\t\tDumping auth_{}_by_popularity.js".format(author.gut_id))
        dumpjs(
            [
                book.to_array()
                for book in books.where(Book.author == author).order_by(
                    Book.downloads.desc()
                )
            ],
            "auth_{}_by_popularity.js".format(author.gut_id),
        )
        # by title
        logger.info("\t\tDumping auth_{}_by_title.js".format(author.gut_id))
        dumpjs(
            [
                book.to_array()
                for book in books.where(Book.author == author).order_by(
                    Book.title.asc()
                )
            ],
            "auth_{}_by_title.js".format(author.gut_id),
        )
        # by language
        for lang_name, lang, lang_count in avail_langs:
            logger.info("\t\tDumping auth_{}_by_lang_{}.js".format(author.gut_id, lang))
            dumpjs(
                [
                    book.to_array()
                    for book in books.where(Book.language == lang)
                    .where(Book.author == author)
                    .order_by(Book.downloads.desc())
                ],
                "auth_{}_lang_{}_by_popularity.js".format(author.gut_id, lang),
            )

            dumpjs(
                [
                    book.to_array()
                    for book in books.where(Book.language == lang)
                    .where(Book.author == author)
                    .order_by(Book.title.asc())
                ],
                "auth_{}_lang_{}_by_title.js".format(author.gut_id, lang),
            )

        # author HTML redirect file
        save_author_file(author, static_folder, books, project_id, force=True)

    # authors list sorted by name
    logger.info("\t\tDumping authors.js")
    dumpjs([author.to_array() for author in authors], "authors.js", "authors_json_data")

    # languages list sorted by code
    logger.info("\t\tDumping languages.js")
    dumpjs(avail_langs, "languages.js", "languages_json_data")

    # languages by weight
    main_languages, other_languages = get_lang_groups(books)
    logger.info("\t\tDumping main_languages.js")
    dumpjs(main_languages, "main_languages.js", "main_languages_json_data")
    dumpjs(other_languages, "other_languages.js", "other_languages_json_data")
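
The bookshelf pages are ordinary Jinja2 renders. A self-contained sketch with an inline template (the project loads bookshelf_home.html and bookshelf.html from its template directory instead):

from jinja2 import DictLoader, Environment

env = Environment(loader=DictLoader({
    "bookshelf.html": "<h1>{{ bookshelf }}</h1>",
}))
print(env.get_template("bookshelf.html").render(bookshelf="Science Fiction"))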
Esempio n. 26
0
def export_book_to(book,
                   static_folder, download_cache,
                   cached_files, languages, formats, books,
                   project_id, force=False):
    logger.info("\tExporting Book #{id}.".format(id=book.id))

    # actual book content, as HTML
    html, encoding = html_content_for(book=book,
                                      static_folder=static_folder,
                                      download_cache=download_cache)
    if html:
        article_fpath = os.path.join(static_folder, article_name_for(book))
        if not path(article_fpath).exists() or force:
            logger.info("\t\tExporting to {}".format(article_fpath))
            try:
                new_html = update_html_for_static(book=book, html_content=html)
            except Exception:
                raise
                new_html = html
            save_bs_output(new_html, article_fpath, UTF8)
        else:
            logger.info("\t\tSkipping HTML article {}".format(article_fpath))

    def symlink_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tSymlinking {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).link(dst)  # hard link
        except IOError:
            logger.error("/!\\ Unable to symlink missing file {}".format(src))
            return

    def copy_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tCopying {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).copy(dst)
        except IOError:
            logger.error("/!\\ Unable to copy missing file {}".format(src))
            return

    def optimize_image(src, dst, force=False):
        if path(dst).exists() and not force:
            logger.info("\tSkipping image optimization for {}".format(dst))
            return dst
        logger.info("\tOptimizing image {}".format(dst))
        if path(src).ext == '.png':
            return optimize_png(src, dst)
        if path(src).ext in ('.jpg', '.jpeg'):
            return optimize_jpeg(src, dst)
        if path(src).ext == '.gif':
            return optimize_gif(src, dst)
        return dst

    def optimize_gif(src, dst):
        exec_cmd(['gifsicle', '-O3', src, '-o', dst])

    def optimize_png(src, dst):
        exec_cmd(['pngquant', '--nofs', '--force',
                  '--output', dst, src])
        exec_cmd(['advdef', '-z', '-4', '-i', '5', dst])

    def optimize_jpeg(src, dst):
        copy_from_cache(src, dst)
        exec_cmd(['jpegoptim', '--strip-all', '-m50', dst])

    def optimize_epub(src, dst):
        logger.info("\t\tCreating ePUB off {} at {}".format(src, dst))
        zipped_files = []
        # create temp directory to extract to
        tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)

        with zipfile.ZipFile(src, 'r') as zf:
            zipped_files = zf.namelist()
            zf.extractall(tmpd)

        remove_cover = False
        for fname in zipped_files:
            fnp = os.path.join(tmpd, fname)
            if path(fname).ext in ('.png', '.jpeg', '.jpg', '.gif'):

                # special case to remove ugly cover
                if fname.endswith('cover.jpg') and is_bad_cover(fnp):
                    zipped_files.remove(fname)
                    remove_cover = True
                else:
                    optimize_image(fnp, fnp)

            if path(fname).ext in ('.htm', '.html'):
                html_content, html_encoding = read_file(fnp)
                html = update_html_for_static(book=book,
                                              html_content=html_content,
                                              epub=True)
                save_bs_output(html, fnp, UTF8)

            if path(fname).ext == '.ncx':
                pattern = "*** START: FULL LICENSE ***"
                ncx, ncx_encoding = read_file(fnp)
                soup = BeautifulSoup(ncx, 'lxml-xml')
                for tag in soup.findAll('text'):
                    if pattern in tag.text:
                        s = tag.parent.parent
                        s.decompose()
                        for s in s.next_siblings:
                            s.decompose()
                        s.next_sibling

                save_bs_output(soup, fnp, UTF8)

        # delete {id}/cover.jpg if exist and update {id}/content.opf
        if remove_cover:

            # remove cover
            path(
                os.path.join(tmpd, text_type(book.id), 'cover.jpg')).unlink_p()

            soup = None
            opff = os.path.join(tmpd, text_type(book.id), 'content.opf')
            if os.path.exists(opff):
                opff_content, opff_encoding = read_file(opff)
                soup = BeautifulSoup(opff_content, 'lxml-xml')

                for elem in soup.findAll():
                    if getattr(elem, 'attrs', {}).get('href') == 'cover.jpg':
                        elem.decompose()

                save_bs_output(soup, opff, UTF8)

        # bundle epub as zip
        zip_epub(epub_fpath=dst,
                 root_folder=tmpd,
                 fpaths=zipped_files)

        path(tmpd).rmtree_p()

    def handle_companion_file(fname, dstfname=None, book=None,
                              force=False, as_ext=None):
        ext = path(fname).ext if as_ext is None else as_ext
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        if path(dst).exists() and not force:
            logger.debug("\t\tSkipping existing companion {}".format(dstfname))
            return

        # optimization based on mime/extension
        if ext in ('.png', '.jpg', '.jpeg', '.gif'):
            logger.info("\t\tCopying and optimizing image companion {}"
                        .format(fname))
            # copy_from_cache(src, dst)
            optimize_image(src, dst)
        elif ext == '.epub':
            logger.info("\t\tCreating optimized EPUB file {}".format(fname))
            tmp_epub = tempfile.NamedTemporaryFile(suffix='.epub',
                                                   dir=TMP_FOLDER)
            tmp_epub.close()
            try:
                optimize_epub(src, tmp_epub.name)
            except zipfile.BadZipFile:
                logger.warn("\t\tBad zip file. "
                            "Copying as it might be working{}".format(fname))
                handle_companion_file(fname, dstfname, book, force,
                                      as_ext='zip')
            else:
                path(tmp_epub.name).move(dst)
        else:
            # excludes files created by Windows Explorer
            if src.endswith('_Thumbs.db'):
                return
            # copy otherwise (PDF mostly)
            logger.debug("\t\tshitty ext: {}".format(dst))
            logger.info("\t\tCopying companion file to {}".format(fname))
            copy_from_cache(src, dst)

    # associated files (images, etc)
    for fname in [fn for fn in cached_files
                  if fn.startswith("{}_".format(book.id))]:

        if path(fname).ext in ('.html', '.htm'):
            src = os.path.join(path(download_cache).abspath(), fname)
            dst = os.path.join(path(static_folder).abspath(), fname)

            if path(dst).exists() and not force:
                logger.debug("\t\tSkipping existing HTML {}".format(dst))
                continue

            logger.info("\t\tExporting HTML file to {}".format(dst))
            html, encoding = read_file(src)
            new_html = update_html_for_static(book=book, html_content=html)
            save_bs_output(new_html, dst, UTF8)
        else:
            try:
                handle_companion_file(fname, force=force)
            except Exception as e:
                logger.exception(e)
                logger.error("\t\tException while handling companion file: {}"
                             .format(e))

    # other formats
    for format in formats:
        if format not in book.formats() or format == 'html':
            continue
        try:
            handle_companion_file(fname_for(book, format),
                                  archive_name_for(book, format),
                                  force=force)
        except Exception as e:
            logger.exception(e)
            logger.error("\t\tException while handling companion file: {}"
                         .format(e))

    # book presentation article
    cover_fpath = os.path.join(static_folder,
                               article_name_for(book=book, cover=True))
    if not path(cover_fpath).exists() or force:
        logger.info("\t\tExporting to {}".format(cover_fpath))
        html = cover_html_content_for(book=book,
                                      static_folder=static_folder,
                                      books=books, project_id=project_id)
        with open(cover_fpath, 'w') as f:
            if six.PY2:
                f.write(html.encode(UTF8))
            else:
                f.write(html)
    else:
        logger.info("\t\tSkipping cover {}".format(cover_fpath))
Example no. 27
def build_zimfile(
    static_folder,
    output_folder,
    zim_name=None,
    languages=[],
    formats=[],
    title=None,
    description=None,
    only_books=[],
    create_index=True,
    force=False,
    title_search=False,
    add_bookshelves=False,
):

    # revert HTML/JS/CSS to zim-compatible versions
    export_skeleton(
        static_folder=static_folder,
        dev_mode=False,
        languages=languages,
        formats=formats,
        only_books=only_books,
        title_search=title_search,
        add_bookshelves=add_bookshelves,
    )

    if not languages:
        languages = ["mul"]

    languages.sort()
    formats.sort()

    if title is None:
        if len(languages) > 5:
            title = "Project Gutenberg Library"
        else:
            title = "Project Gutenberg Library ({langs})".format(
                langs=",".join(languages)
            )

        if len(formats) < len(FORMAT_MATRIX):
            title += " with {formats}".format(formats=",".join(formats))

    logger.info("\tWritting ZIM for {}".format(title))

    if description is None:
        description = "The first producer of free ebooks"

    project_id = get_project_id(languages, formats, only_books)

    if zim_name is None:
        zim_name = "{}.zim".format(project_id)
    zim_path = output_folder.joinpath(zim_name)

    if zim_path.exists() and not force:
        logger.info("ZIM file `{}` already exists.".format(zim_name))
        return

    languages = [ISO_MATRIX.get(lang, lang) for lang in languages]
    languages.sort()

    cmd = [
        "zimwriterfs",
        "--welcome",
        "Home.html",
        "--favicon",
        "favicon.png",
        "--language",
        ",".join(languages),
        "--name",
        project_id,
        "--title",
        title,
        "--description",
        description,
        "--creator",
        "gutenberg.org",
        "--tags",
        "gutenberg",
        "--publisher",
        "Kiwix",
        "--scraper",
        "gutengergtozim-{v}".format(v=VERSION),
        static_folder,
        six.text_type(zim_path),
    ]

    if not create_index:
        cmd.insert(1, "--withoutFTIndex")
    if exec_cmd(cmd) == 0:
        logger.info("Successfuly created ZIM file at {}".format(zim_path))
    else:
        logger.error("Unable to create ZIM file :(")
Example no. 28
    def optimize_epub(src, dst):
        logger.info("\t\tCreating ePUB off {} at {}".format(src, dst))
        zipped_files = []
        # create temp directory to extract to
        tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)

        with zipfile.ZipFile(src, 'r') as zf:
            zipped_files = zf.namelist()
            zf.extractall(tmpd)

        remove_cover = False
        for fname in list(zipped_files):  # iterate a copy; entries may be removed
            fnp = os.path.join(tmpd, fname)
            if path(fname).ext in ('.png', '.jpeg', '.jpg', '.gif'):

                # special case to remove ugly cover
                if fname.endswith('cover.jpg') and is_bad_cover(fnp):
                    zipped_files.remove(fname)
                    remove_cover = True
                else:
                    optimize_image(fnp, fnp)

            if path(fname).ext in ('.htm', '.html'):
                html_content, html_encoding = read_file(fnp)
                html = update_html_for_static(book=book,
                                              html_content=html_content,
                                              epub=True)
                save_bs_output(html, fnp, UTF8)

            if path(fname).ext == '.ncx':
                pattern = "*** START: FULL LICENSE ***"
                ncx, ncx_encoding = read_file(fnp)
                soup = BeautifulSoup(ncx, 'lxml-xml')
                for tag in soup.findAll('text'):
                    if pattern in tag.text:
                        s = tag.parent.parent
                        # snapshot trailing siblings first: the tree is
                        # mutated while we remove the license block
                        for sibling in list(s.next_siblings):
                            sibling.extract()
                        s.decompose()

                save_bs_output(soup, fnp, UTF8)

        # delete {id}/cover.jpg if it exists and update {id}/content.opf
        if remove_cover:

            # remove cover
            path(
                os.path.join(tmpd, text_type(book.id), 'cover.jpg')).unlink_p()

            soup = None
            opff = os.path.join(tmpd, text_type(book.id), 'content.opf')
            if os.path.exists(opff):
                opff_content, opff_encoding = read_file(opff)
                soup = BeautifulSoup(opff_content, 'lxml-xml')

                for elem in soup.findAll():
                    if getattr(elem, 'attrs', {}).get('href') == 'cover.jpg':
                        elem.decompose()

                save_bs_output(soup, opff, UTF8)

        # bundle epub as zip
        zip_epub(epub_fpath=dst,
                 root_folder=tmpd,
                 fpaths=zipped_files)

        path(tmpd).rmtree_p()
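
zip_epub itself is not among these examples; a plausible sketch of what it
must do, assuming the standard EPUB layout (the mimetype entry has to come
first and be stored uncompressed):

import os
import zipfile

def zip_epub(epub_fpath, root_folder, fpaths):
    # rebuild the archive from the extracted tree
    with zipfile.ZipFile(epub_fpath, "w", zipfile.ZIP_DEFLATED) as zf:
        if "mimetype" in fpaths:
            zf.write(os.path.join(root_folder, "mimetype"), "mimetype",
                     compress_type=zipfile.ZIP_STORED)
        for fname in fpaths:
            if fname == "mimetype":
                continue
            zf.write(os.path.join(root_folder, fname), fname)
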
Example no. 29
def load_fixtures(model):
    logger.info("Loading fixtures for {}".format(model._meta.name))

    for fixture in getattr(model._meta, 'fixtures', []):
        f = model.create(**fixture)
        logger.debug("[fixtures] Created {}".format(f))
Example no. 30
def export_to_json_helpers(books, static_folder, languages,
                           formats, project_id):

    def dumpjs(col, fn, var='json_data'):
        with open(os.path.join(static_folder, fn), 'w') as f:
            f.write("var {var} = ".format(var=var))
            f.write(json.dumps(col))
            f.write(";")
            # json.dump(col, f)

    # all books sorted by popularity
    logger.info("\t\tDumping full_by_popularity.js")
    dumpjs([book.to_array()
            for book in books.order_by(Book.downloads.desc())],
           'full_by_popularity.js')

    # all books sorted by title
    logger.info("\t\tDumping full_by_title.js")
    dumpjs([book.to_array()
            for book in books.order_by(Book.title.asc())],
           'full_by_title.js')

    avail_langs = get_langs_with_count(books=books)

    all_filtered_authors = []

    # language-specific collections
    for lang_name, lang, lang_count in avail_langs:
        lang_filtered_authors = list(
            set([book.author.gut_id for book in books.filter(language=lang)]))
        for aid in lang_filtered_authors:
            if aid not in all_filtered_authors:
                all_filtered_authors.append(aid)

        # by popularity
        logger.info("\t\tDumping lang_{}_by_popularity.js".format(lang))
        dumpjs(
            [book.to_array()
             for book in books.where(Book.language == lang)
                              .order_by(Book.downloads.desc())],
            'lang_{}_by_popularity.js'.format(lang))
        # by title
        logger.info("\t\tDumping lang_{}_by_title.js".format(lang))
        dumpjs(
            [book.to_array()
             for book in books.where(Book.language == lang)
                              .order_by(Book.title.asc())],
            'lang_{}_by_title.js'.format(lang))

        authors = authors_from_ids(lang_filtered_authors)
        logger.info("\t\tDumping authors_lang_{}.js".format(lang))
        dumpjs([author.to_array() for author in authors],
               'authors_lang_{}.js'.format(lang), 'authors_json_data')

    # author specific collections
    authors = authors_from_ids(all_filtered_authors)
    for author in authors:

        # all_filtered_authors.remove(author.gut_id)
        # by popularity
        logger.info(
            "\t\tDumping auth_{}_by_popularity.js".format(author.gut_id))
        dumpjs(
            [book.to_array()
             for book in books.where(Book.author == author)
                              .order_by(Book.downloads.desc())],
            'auth_{}_by_popularity.js'.format(author.gut_id))
        # by title
        logger.info("\t\tDumping auth_{}_by_title.js".format(author.gut_id))
        dumpjs(
            [book.to_array()
             for book in books.where(Book.author == author)
                              .order_by(Book.title.asc())],
            'auth_{}_by_title.js'.format(author.gut_id))
        # by language
        for lang_name, lang, lang_count in avail_langs:
            logger.info("\t\tDumping auth_{}_by_lang_{}.js"
                        .format(author.gut_id, lang))
            dumpjs(
                [book.to_array()
                 for book in books.where(Book.language == lang)
                                  .where(Book.author == author)
                                  .order_by(Book.downloads.desc())],
                'auth_{}_lang_{}_by_popularity.js'.format(author.gut_id, lang))

            dumpjs(
                [book.to_array()
                 for book in books.where(Book.language == lang)
                                  .where(Book.author == author)
                                  .order_by(Book.title.asc())],
                'auth_{}_lang_{}_by_title.js'.format(author.gut_id, lang))

        # author HTML redirect file
        save_author_file(author, static_folder, books, project_id, force=True)

    # authors list sorted by name
    logger.info("\t\tDumping authors.js")
    dumpjs([author.to_array() for author in authors],
           'authors.js', 'authors_json_data')

    # languages list sorted by code
    logger.info("\t\tDumping languages.js")
    dumpjs(avail_langs, 'languages.js', 'languages_json_data')

    # languages by weight
    main_languages, other_languages = get_lang_groups(books)
    logger.info("\t\tDumping main_languages.js")
    dumpjs(main_languages, 'main_languages.js', 'main_languages_json_data')
    dumpjs(other_languages, 'other_languages.js', 'other_languages_json_data')
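
dumpjs writes a JS assignment rather than raw JSON so the static frontend
can pull each collection in with a plain <script> tag; a tiny standalone
demonstration with illustrative data:

import json
import os
import tempfile

static_folder = tempfile.mkdtemp()

def dumpjs(col, fn, var='json_data'):
    # same shape as the helper above
    with open(os.path.join(static_folder, fn), 'w') as f:
        f.write("var {var} = ".format(var=var))
        f.write(json.dumps(col))
        f.write(";")

dumpjs([["English", "en", 3000]], 'languages.js', 'languages_json_data')
with open(os.path.join(static_folder, 'languages.js')) as f:
    print(f.read())  # var languages_json_data = [["English", "en", 3000]];
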
Example no. 31
def load_fixtures(model):
    logger.info("Loading fixtures for {}".format(model._meta.name))

    for fixture in getattr(model._meta, "fixtures", []):
        f = model.create(**fixture)
        logger.debug("[fixtures] Created {}".format(f))
Example no. 32
def handle_unoptimized_files(
    book,
    static_folder,
    src_dir,
    languages,
    formats,
    books,
    project_id,
    optimizer_version,
    force=False,
    title_search=False,
    add_bookshelves=False,
    s3_storage=None,
):
    def copy_file(src, dst):
        logger.info("\t\tCopying {}".format(dst))
        try:
            shutil.copy2(src, dst)
        except IOError:
            logger.error("/!\\ Unable to copy missing file {}".format(src))
            return

    def update_download_cache(unoptimized_file, optimized_file):
        book_dir = unoptimized_file.parents[1]
        optimized_dir = book_dir.joinpath("optimized")
        unoptimized_dir = book_dir.joinpath("unoptimized")
        if not optimized_dir.exists():
            optimized_dir.mkdir()
        dst = optimized_dir.joinpath(optimized_file.name)
        os.unlink(unoptimized_file)
        copy_file(optimized_file.resolve(), dst.resolve())
        if not [fpath for fpath in unoptimized_dir.iterdir()]:
            unoptimized_dir.rmdir()

    logger.info("\tExporting Book #{id}.".format(id=book.id))

    # actual book content, as HTML
    html, _ = html_content_for(book=book, src_dir=src_dir)
    html_book_optimized_files = []
    if html:
        article_fpath = static_folder.joinpath(article_name_for(book))
        if not article_fpath.exists() or force:
            logger.info("\t\tExporting to {}".format(article_fpath))
            new_html = update_html_for_static(book=book, html_content=html)
            save_bs_output(new_html, article_fpath, UTF8)
            html_book_optimized_files.append(article_fpath)
            update_download_cache(
                src_dir.joinpath(fname_for(book, "html")), article_fpath
            )
            if not src_dir.exists():
                return
        else:
            logger.info("\t\tSkipping HTML article {}".format(article_fpath))

    def optimize_image(src, dst, force=False):
        if dst.exists() and not force:
            logger.info("\tSkipping image optimization for {}".format(dst))
            return dst
        logger.info("\tOptimizing image {}".format(dst))
        if src.suffix == ".png":
            return optimize_png(str(src.resolve()), str(dst.resolve()))
        if src.suffix in (".jpg", ".jpeg"):
            return optimize_jpeg(str(src.resolve()), str(dst.resolve()))
        if src.suffix == ".gif":
            return optimize_gif(str(src.resolve()), str(dst.resolve()))
        return dst

    def optimize_gif(src, dst):
        exec_cmd(["gifsicle", "-O3", src, "-o", dst])

    def optimize_png(src, dst):
        exec_cmd(["pngquant", "--nofs", "--force", "--output", dst, src])
        exec_cmd(["advdef", "-z", "-4", "-i", "5", dst])

    def optimize_jpeg(src, dst):
        if src != dst:
            copy_file(src, dst)
        exec_cmd(["jpegoptim", "--strip-all", "-m50", dst])

    def optimize_epub(src, dst):
        logger.info("\t\tCreating ePUB off {} at {}".format(src, dst))
        zipped_files = []
        # create temp directory to extract to
        tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)

        try:
            with zipfile.ZipFile(src, "r") as zf:
                zipped_files = zf.namelist()
                zf.extractall(tmpd)
        except zipfile.BadZipFile as exc:
            shutil.rmtree(tmpd)
            raise exc

        remove_cover = False
        for fname in list(zipped_files):  # iterate a copy; entries may be removed
            fnp = os.path.join(tmpd, fname)
            if path(fname).ext in (".png", ".jpeg", ".jpg", ".gif"):

                # special case to remove ugly cover
                if fname.endswith("cover.jpg") and is_bad_cover(fnp):
                    zipped_files.remove(fname)
                    remove_cover = True
                else:
                    optimize_image(pathlib.Path(fnp), pathlib.Path(fnp), force=True)

            if path(fname).ext in (".htm", ".html"):
                html_content, _ = read_file(fnp)
                html = update_html_for_static(
                    book=book, html_content=html_content, epub=True
                )
                save_bs_output(html, fnp, UTF8)

            if path(fname).ext == ".ncx":
                pattern = "*** START: FULL LICENSE ***"
                ncx, _ = read_file(fnp)
                soup = BeautifulSoup(ncx, "lxml-xml")
                for tag in soup.findAll("text"):
                    if pattern in tag.text:
                        s = tag.parent.parent
                        s.decompose()
                        for s in s.next_siblings:
                            s.decompose()
                        s.next_sibling

                save_bs_output(soup, fnp, UTF8)

        # delete {id}/cover.jpg if it exists and update {id}/content.opf
        if remove_cover:

            # remove cover
            path(os.path.join(tmpd, text_type(book.id), "cover.jpg")).unlink_p()

            soup = None
            opff = os.path.join(tmpd, text_type(book.id), "content.opf")
            if os.path.exists(opff):
                opff_content, _ = read_file(opff)
                soup = BeautifulSoup(opff_content, "lxml-xml")

                for elem in soup.findAll():
                    if getattr(elem, "attrs", {}).get("href") == "cover.jpg":
                        elem.decompose()

                save_bs_output(soup, opff, UTF8)

        # bundle epub as zip
        zip_epub(epub_fpath=dst, root_folder=tmpd, fpaths=zipped_files)

        path(tmpd).rmtree_p()

    def handle_companion_file(
        fname,
        dstfname=None,
        book=None,
        force=False,
        as_ext=None,
        html_file_list=None,
        s3_storage=None,
    ):
        ext = fname.suffix if as_ext is None else as_ext
        src = fname
        if dstfname is None:
            dstfname = fname.name
        dst = static_folder.joinpath(dstfname)
        if dst.exists() and not force:
            logger.debug("\t\tSkipping existing companion {}".format(dstfname))
            return

        # optimization based on mime/extension
        if ext in (".png", ".jpg", ".jpeg", ".gif"):
            logger.info("\t\tCopying and optimizing image companion {}".format(fname))
            optimize_image(src, dst)
            if dst.name == (f"{book.id}_cover_image.jpg") and s3_storage:
                upload_to_cache(
                    asset=dst,
                    book_format="cover",
                    book_id=book.id,
                    etag=book.cover_etag,
                    s3_storage=s3_storage,
                    optimizer_version=optimizer_version,
                )
                update_download_cache(src, dst)
            elif html_file_list:
                html_file_list.append(dst)
                update_download_cache(src, dst)
        elif ext == ".epub":
            logger.info("\t\tCreating optimized EPUB file {}".format(fname))
            tmp_epub = tempfile.NamedTemporaryFile(suffix=".epub", dir=TMP_FOLDER)
            tmp_epub.close()
            try:
                optimize_epub(src, tmp_epub.name)
            except zipfile.BadZipFile:
                logger.warning(
                    "\t\tBad zip file. "
                    "Copying as it might be working: {}".format(fname)
                )
                handle_companion_file(fname, dstfname, book, force, as_ext=".zip")
            else:
                path(tmp_epub.name).move(dst)
                if s3_storage:
                    upload_to_cache(
                        asset=dst,
                        book_format="epub",
                        book_id=book.id,
                        etag=book.epub_etag,
                        s3_storage=s3_storage,
                        optimizer_version=optimizer_version,
                    )
                    update_download_cache(src, dst)
        else:
            # excludes files created by Windows Explorer
            if src.name.endswith("_Thumbs.db"):
                return
            # copy otherwise (PDF mostly)
            logger.info("\t\tCopying companion file to {}".format(dst))
            copy_file(src, dst)
            if ext != ".pdf" and ext != ".zip" and html_file_list:
                html_file_list.append(dst)
                update_download_cache(src, dst)

    # associated files (images, etc)
    for fpath in src_dir.iterdir():
        if fpath.is_file() and fpath.name.startswith(f"{book.id}_"):
            if fpath.suffix in (".html", ".htm"):
                src = fpath
                dst = static_folder.joinpath(fpath.name)
                if dst.exists() and not force:
                    logger.debug("\t\tSkipping existing HTML {}".format(dst))
                    continue

                logger.info("\t\tExporting HTML file to {}".format(dst))
                html, _ = read_file(src)
                new_html = update_html_for_static(book=book, html_content=html)
                save_bs_output(new_html, dst, UTF8)
                html_book_optimized_files.append(dst)
                update_download_cache(src, dst)
            else:
                try:
                    handle_companion_file(
                        fpath,
                        force=force,
                        html_file_list=html_book_optimized_files,
                        s3_storage=s3_storage,
                        book=book,
                    )
                except Exception as e:
                    logger.exception(e)
                    logger.error(
                        "\t\tException while handling companion file: {}".format(e)
                    )
    if s3_storage and html_book_optimized_files:
        upload_to_cache(
            asset=html_book_optimized_files,
            book_format="html",
            etag=book.html_etag,
            book_id=book.id,
            s3_storage=s3_storage,
            optimizer_version=optimizer_version,
        )

    # other formats
    for format in formats:
        if format not in book.formats() or format == "html":
            continue
        book_file = src_dir.joinpath(fname_for(book, format))
        if book_file.exists():
            try:
                handle_companion_file(
                    book_file,
                    archive_name_for(book, format),
                    force=force,
                    book=book,
                    s3_storage=s3_storage,
                )
            except Exception as e:
                logger.exception(e)
                logger.error(
                    "\t\tException while handling companion file: {}".format(e)
                )
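
The per-format optimizers above shell out to external tools (gifsicle,
pngquant, advdef, jpegoptim), so those binaries must be on PATH. A hedged
usage sketch of the suffix-based dispatch done by the nested optimize_image
helper, with illustrative paths:

import pathlib

src = pathlib.Path("cache/1234/unoptimized/1234_cover_image.jpg")
dst = pathlib.Path("static/1234_cover_image.jpg")

# .jpg suffix routes to optimize_jpeg: copy + jpegoptim --strip-all -m50
optimize_image(src, dst)
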
Example no. 33
def build_zimfile(static_folder,
                  zim_path=None,
                  languages=[],
                  formats=[],
                  title=None,
                  description=None,
                  only_books=[],
                  create_index=True,
                  force=False):

    # revert HTML/JS/CSS to zim-compatible versions
    export_skeleton(static_folder=static_folder,
                    dev_mode=False,
                    languages=languages,
                    formats=formats,
                    only_books=only_books)

    if not languages:
        languages = ['mul']

    languages.sort()
    formats.sort()

    if title is None:
        if len(languages) > 5:
            title = ("Project Gutenberg Library with {formats}".format(
                formats=",".join(formats)))
        else:
            title = (
                "Project Gutenberg Library ({langs}) with {formats}".format(
                    langs=",".join(languages), formats=",".join(formats)))

    logger.info("\tWritting ZIM for {}".format(title))

    if description is None:
        description = "The first producer of free ebooks"

    project_id = get_project_id(languages, formats, only_books)

    if zim_path is None:
        zim_path = "{}.zim".format(project_id)

    if path(zim_path).exists() and not force:
        logger.info("ZIM file `{}` already exist.".format(zim_path))
        return

    languages = [ISO_MATRIX.get(lang, lang) for lang in languages]
    languages.sort()

    cmd = [
        'zimwriterfs', '--welcome', "Home.html", '--favicon', "favicon.png",
        '--language', ','.join(languages), '--name', project_id, '--title',
        title, '--description', description, '--creator', "gutenberg.org",
        '--publisher', "Kiwix", static_folder, zim_path
    ]

    if create_index:
        cmd.insert(1, '--withFullTextIndex')
    if exec_cmd(cmd) == 0:
        logger.info("Successfuly created ZIM file at {}".format(zim_path))
    else:
        logger.error("Unable to create ZIM file :(")
Example no. 34
def download_book(book, download_cache, languages, formats, force, s3_storage,
                  optimizer_version):
    logger.info(
        "\tDownloading content files for Book #{id}".format(id=book.id))

    # apply filters
    if not formats:
        formats = FORMAT_MATRIX.keys()

    # HTML is our base for the ZIM, so add it if not present
    if "html" not in formats:
        formats.append("html")

    book_dir = pathlib.Path(download_cache).joinpath(str(book.id))
    optimized_dir = book_dir.joinpath("optimized")
    unoptimized_dir = book_dir.joinpath("unoptimized")
    unsuccessful_formats = []
    for book_format in formats:

        unoptimized_fpath = unoptimized_dir.joinpath(
            fname_for(book, book_format))
        optimized_fpath = optimized_dir.joinpath(
            archive_name_for(book, book_format))

        # check if already downloaded
        if (unoptimized_fpath.exists()
                or optimized_fpath.exists()) and not force:
            logger.debug(
                f"\t\t{book_format} already exists for book #{book.id}")
            continue

        if force:
            if book_format == "html":
                for fpath in book_dir.iterdir():
                    if fpath.is_file() and fpath.suffix not in [
                            ".pdf", ".epub"
                    ]:
                        fpath.unlink()
            else:
                if unoptimized_fpath.exists():
                    unoptimized_fpath.unlink()
                if optimized_fpath.exists():
                    optimized_fpath.unlink()
            # delete dirs which are empty
            for dir_name in [optimized_dir, unoptimized_dir]:
                if not dir_name.exists():
                    continue
                if not list(dir_name.iterdir()):
                    dir_name.rmdir()

        # retrieve corresponding BookFormat
        bfs = BookFormat.filter(book=book)

        if book_format == "html":
            patterns = [
                "mnsrb10h.htm",
                "8ledo10h.htm",
                "tycho10f.htm",
                "8ledo10h.zip",
                "salme10h.htm",
                "8nszr10h.htm",
                "{id}-h.html",
                "{id}.html.gen",
                "{id}-h.htm",
                "8regr10h.zip",
                "{id}.html.noimages",
                "8lgme10h.htm",
                "tycho10h.htm",
                "tycho10h.zip",
                "8lgme10h.zip",
                "8indn10h.zip",
                "8resp10h.zip",
                "20004-h.htm",
                "8indn10h.htm",
                "8memo10h.zip",
                "fondu10h.zip",
                "{id}-h.zip",
                "8mort10h.zip",
            ]
            bfso = bfs
            bfs = bfs.join(Format).filter(Format.pattern << patterns)
            if not bfs.count():
                pp([(b.format.mime, b.format.images, b.format.pattern)
                    for b in bfs])
                pp([(b.format.mime, b.format.images, b.format.pattern)
                    for b in bfso])
                logger.error("html not found")
                unsuccessful_formats.append(book_format)
                continue
        else:
            bfs = bfs.filter(BookFormat.format << Format.filter(
                mime=FORMAT_MATRIX.get(book_format)))

        if not bfs.count():
            logger.debug("[{}] not avail. for #{}# {}".format(
                book_format, book.id, book.title))
            unsuccessful_formats.append(book_format)
            continue

        if bfs.count() > 1:
            try:
                bf = bfs.join(Format).filter(Format.images).get()
            except Exception:
                bf = bfs.get()
        else:
            bf = bfs.get()

        logger.debug("[{}] Requesting URLs for #{}# {}".format(
            book_format, book.id, book.title))

        # retrieve list of URLs for format unless we have it in DB
        if bf.downloaded_from and not force:
            urls = [bf.downloaded_from]
        else:
            urld = get_urls(book)
            urls = list(reversed(urld.get(FORMAT_MATRIX.get(book_format))))

        allurls = list(urls)  # keep a snapshot for error reporting below
        downloaded_from_cache = False

        while urls:
            url = urls.pop()

            # for development
            # if len(allurls) != 1:
            #     if not resource_exists(url):
            #         continue

            # HTML files are *sometimes* available as ZIP files
            if url.endswith(".zip"):
                zpath = unoptimized_dir.joinpath(
                    f"{fname_for(book, book_format)}.zip")

                etag = get_etag_from_url(url)
                if s3_storage:
                    if download_from_cache(
                            book=book,
                            etag=etag,
                            book_format=book_format,
                            dest_dir=optimized_dir,
                            s3_storage=s3_storage,
                            optimizer_version=optimizer_version,
                    ):
                        downloaded_from_cache = True
                        break
                if not download_file(url, zpath):
                    logger.error("ZIP file donwload failed: {}".format(zpath))
                    continue
                # save etag
                book.html_etag = etag
                book.save()
                # extract zipfile
                handle_zipped_epub(zippath=zpath,
                                   book=book,
                                   dst_dir=unoptimized_dir)
            else:
                if (url.endswith(".htm") or url.endswith(".html")
                        or url.endswith(".html.utf8")
                        or url.endswith(".epub")):
                    etag = get_etag_from_url(url)
                    if s3_storage:
                        logger.info(
                            f"Trying to download {book.id} from optimization cache"
                        )
                        if download_from_cache(
                                book=book,
                                etag=etag,
                                book_format=book_format,
                                dest_dir=optimized_dir,
                                s3_storage=s3_storage,
                                optimizer_version=optimizer_version,
                        ):
                            downloaded_from_cache = True
                            break
                if not download_file(url, unoptimized_fpath):
                    logger.error(
                        "file download failed: {}".format(unoptimized_fpath))
                    continue
                # save etag for html or epub when the download succeeds
                if (url.endswith(".htm") or url.endswith(".html")
                        or url.endswith(".html.utf8")):
                    logger.debug(f"Saving html ETag for {book.id}")
                    book.html_etag = etag
                    book.save()
                elif url.endswith(".epub"):
                    logger.debug(f"Saving epub ETag for {book.id}")
                    book.epub_etag = etag
                    book.save()

            # store working URL in DB
            bf.downloaded_from = url
            bf.save()
            # break as we got a working URL
            break

        if not bf.downloaded_from and not downloaded_from_cache:
            logger.error("NO FILE FOR #{}/{}".format(book.id, book_format))
            # delete instance from DB if download failed
            logger.info("Deleting instance from DB")
            bf.delete_instance()
            unsuccessful_formats.append(book_format)
            pp(allurls)

    # delete book from DB if not downloaded in any format
    if len(unsuccessful_formats) == len(formats):
        logger.debug(
            f"Book #{book.id} could not be downloaded in any format. Deleting from DB ..."
        )
        book.delete_instance()
        if book_dir.exists():
            shutil.rmtree(book_dir, ignore_errors=True)
        return
    download_cover(book, book_dir, s3_storage, optimizer_version)
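
The download loop above keys the S3 optimization cache on the upstream ETag.
get_etag_from_url is defined elsewhere in the project; one plausible sketch,
using only a HEAD request (shown as an assumption, not the actual helper):

import urllib.request

def get_etag_from_url(url):
    # HEAD request; return the ETag header if the server provides one
    req = urllib.request.Request(url, method="HEAD")
    try:
        with urllib.request.urlopen(req) as resp:
            return resp.headers.get("ETag")
    except OSError:
        return None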