Code Example #1
def html_content_for(book, src_dir):

    html_fpath = src_dir.joinpath(fname_for(book, "html"))

    # is HTML file present?
    if not html_fpath.exists():
        logger.warn("Missing HTML content for #{} at {}".format(book.id, html_fpath))
        return None, None

    try:
        return read_file(html_fpath)
    except UnicodeDecodeError:
        logger.error("Unable to read HTML content: {}".format(html_fpath))
        raise
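
A minimal usage sketch for the pathlib-based version above. fname_for, read_file and logger belong to the surrounding kiwix/gutenberg module and are not shown here; the Book stand-in and the cache layout below are hypothetical, only meant to make the call shape concrete.

import pathlib
from types import SimpleNamespace

# hypothetical stand-in for a gutenberg Book record
book = SimpleNamespace(id=1342)
# hypothetical cache layout; the real src_dir comes from the scraper's setup
src_dir = pathlib.Path("dl-cache").joinpath(str(book.id), "unoptimized")

html, encoding = html_content_for(book=book, src_dir=src_dir)
if html is None:
    print("no HTML available for #{}".format(book.id))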
Code Example #2
File: export.py Project: kiwix/gutenberg
def html_content_for(book, static_folder, download_cache):

    html_fpath = os.path.join(download_cache, fname_for(book, 'html'))

    # is HTML file present?
    if not path(html_fpath).exists():
        logger.warn("Missing HTML content for #{} at {}"
                    .format(book.id, html_fpath))
        return None, None

    try:
        return read_file(html_fpath)
    except UnicodeDecodeError:
        logger.error("Unable to read HTML content: {}".format(html_fpath))
        raise
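
Both versions delegate to read_file, which these snippets do not show. Judging from the call sites, it returns a (content, encoding) tuple and can raise UnicodeDecodeError. A hypothetical sketch consistent with that contract; the actual kiwix/gutenberg helper may well differ:

import pathlib

def read_file(fpath):
    # decode may raise UnicodeDecodeError, which html_content_for logs and re-raises
    data = pathlib.Path(fpath).read_bytes()
    return data.decode("utf-8"), "utf-8"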
Code Example #3
def handle_unoptimized_files(
    book,
    static_folder,
    src_dir,
    languages,
    formats,
    books,
    project_id,
    optimizer_version,
    force=False,
    title_search=False,
    add_bookshelves=False,
    s3_storage=None,
):
    def copy_file(src, dst):
        logger.info("\t\tCopying {}".format(dst))
        try:
            shutil.copy2(src, dst)
        except IOError:
            logger.error("/!\\ Unable to copy missing file {}".format(src))
            return

    def update_download_cache(unoptimized_file, optimized_file):
        book_dir = unoptimized_file.parents[1]
        optimized_dir = book_dir.joinpath("optimized")
        unoptimized_dir = book_dir.joinpath("unoptimized")
        if not optimized_dir.exists():
            optimized_dir.mkdir()
        dst = optimized_dir.joinpath(optimized_file.name)
        os.unlink(unoptimized_file)
        copy_file(optimized_file.resolve(), dst.resolve())
        if not any(unoptimized_dir.iterdir()):
            unoptimized_dir.rmdir()

    logger.info("\tExporting Book #{id}.".format(id=book.id))

    # actual book content, as HTML
    html, _ = html_content_for(book=book, src_dir=src_dir)
    html_book_optimized_files = []
    if html:
        article_fpath = static_folder.joinpath(article_name_for(book))
        if not article_fpath.exists() or force:
            logger.info("\t\tExporting to {}".format(article_fpath))
            new_html = update_html_for_static(book=book, html_content=html)
            save_bs_output(new_html, article_fpath, UTF8)
            html_book_optimized_files.append(article_fpath)
            update_download_cache(
                src_dir.joinpath(fname_for(book, "html")), article_fpath
            )
            if not src_dir.exists():
                return
        else:
            logger.info("\t\tSkipping HTML article {}".format(article_fpath))

    def optimize_image(src, dst, force=False):
        if dst.exists() and not force:
            logger.info("\tSkipping image optimization for {}".format(dst))
            return dst
        logger.info("\tOptimizing image {}".format(dst))
        if src.suffix == ".png":
            return optimize_png(str(src.resolve()), str(dst.resolve()))
        if src.suffix in (".jpg", ".jpeg"):
            return optimize_jpeg(str(src.resolve()), str(dst.resolve()))
        if src.suffix == ".gif":
            return optimize_gif(str(src.resolve()), str(dst.resolve()))
        return dst

    def optimize_gif(src, dst):
        exec_cmd(["gifsicle", "-O3", src, "-o", dst])

    def optimize_png(src, dst):
        exec_cmd(["pngquant", "--nofs", "--force", "--output", dst, src])
        exec_cmd(["advdef", "-z", "-4", "-i", "5", dst])

    def optimize_jpeg(src, dst):
        if src != dst:
            copy_file(src, dst)
        exec_cmd(["jpegoptim", "--strip-all", "-m50", dst])

    def optimize_epub(src, dst):
        logger.info("\t\tCreating ePUB off {} at {}".format(src, dst))
        zipped_files = []
        # create temp directory to extract to
        tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)

        try:
            with zipfile.ZipFile(src, "r") as zf:
                zipped_files = zf.namelist()
                zf.extractall(tmpd)
        except zipfile.BadZipFile as exc:
            shutil.rmtree(tmpd)
            raise exc

        remove_cover = False
        for fname in list(zipped_files):  # iterate a copy; the list is mutated below
            fnp = os.path.join(tmpd, fname)
            if path(fname).ext in (".png", ".jpeg", ".jpg", ".gif"):

                # special case to remove ugly cover
                if fname.endswith("cover.jpg") and is_bad_cover(fnp):
                    zipped_files.remove(fname)
                    remove_cover = True
                else:
                    optimize_image(pathlib.Path(fnp), pathlib.Path(fnp), force=True)

            if path(fname).ext in (".htm", ".html"):
                html_content, _ = read_file(fnp)
                html = update_html_for_static(
                    book=book, html_content=html_content, epub=True
                )
                save_bs_output(html, fnp, UTF8)

            if path(fname).ext == ".ncx":
                pattern = "*** START: FULL LICENSE ***"
                ncx, _ = read_file(fnp)
                soup = BeautifulSoup(ncx, "lxml-xml")
                for tag in soup.findAll("text"):
                    if pattern in tag.text:
                        block = tag.parent.parent
                        # drop everything after the license block, then the
                        # block itself (materialize siblings before mutating)
                        for sibling in list(block.next_siblings):
                            sibling.extract()
                        block.decompose()

                save_bs_output(soup, fnp, UTF8)

        # delete {id}/cover.jpg if exist and update {id}/content.opf
        if remove_cover:

            # remove cover
            path(os.path.join(tmpd, text_type(book.id), "cover.jpg")).unlink_p()

            soup = None
            opff = os.path.join(tmpd, text_type(book.id), "content.opf")
            if os.path.exists(opff):
                opff_content, _ = read_file(opff)
                soup = BeautifulSoup(opff_content, "lxml-xml")

                for elem in soup.findAll():
                    if getattr(elem, "attrs", {}).get("href") == "cover.jpg":
                        elem.decompose()

                save_bs_output(soup, opff, UTF8)

        # bundle epub as zip
        zip_epub(epub_fpath=dst, root_folder=tmpd, fpaths=zipped_files)

        path(tmpd).rmtree_p()

    def handle_companion_file(
        fname,
        dstfname=None,
        book=None,
        force=False,
        as_ext=None,
        html_file_list=None,
        s3_storage=None,
    ):
        ext = fname.suffix if as_ext is None else as_ext
        src = fname
        if dstfname is None:
            dstfname = fname.name
        dst = static_folder.joinpath(dstfname)
        if dst.exists() and not force:
            logger.debug("\t\tSkipping existing companion {}".format(dstfname))
            return

        # optimization based on mime/extension
        if ext in (".png", ".jpg", ".jpeg", ".gif"):
            logger.info("\t\tCopying and optimizing image companion {}".format(fname))
            optimize_image(src, dst)
            if dst.name == (f"{book.id}_cover_image.jpg") and s3_storage:
                upload_to_cache(
                    asset=dst,
                    book_format="cover",
                    book_id=book.id,
                    etag=book.cover_etag,
                    s3_storage=s3_storage,
                    optimizer_version=optimizer_version,
                )
                update_download_cache(src, dst)
            elif html_file_list:
                html_file_list.append(dst)
                update_download_cache(src, dst)
        elif ext == ".epub":
            logger.info("\t\tCreating optimized EPUB file {}".format(fname))
            tmp_epub = tempfile.NamedTemporaryFile(suffix=".epub", dir=TMP_FOLDER)
            tmp_epub.close()
            try:
                optimize_epub(src, tmp_epub.name)
            except zipfile.BadZipFile:
                logger.warning(
                    "\t\tBad zip file. "
                    "Copying as it might be working: {}".format(fname)
                )
                handle_companion_file(fname, dstfname, book, force, as_ext=".zip")
            else:
                path(tmp_epub.name).move(dst)
                if s3_storage:
                    upload_to_cache(
                        asset=dst,
                        book_format="epub",
                        book_id=book.id,
                        etag=book.epub_etag,
                        s3_storage=s3_storage,
                        optimizer_version=optimizer_version,
                    )
                    update_download_cache(src, dst)
        else:
            # excludes files created by Windows Explorer
            if src.name.endswith("_Thumbs.db"):
                return
            # copy otherwise (PDF mostly)
            logger.info("\t\tCopying companion file to {}".format(dst))
            copy_file(src, dst)
            if ext != ".pdf" and ext != ".zip" and html_file_list:
                html_file_list.append(dst)
                update_download_cache(src, dst)

    # associated files (images, etc)
    for fpath in src_dir.iterdir():
        if fpath.is_file() and fpath.name.startswith(f"{book.id}_"):
            if fpath.suffix in (".html", ".htm"):
                src = fpath
                dst = static_folder.joinpath(fpath.name)
                if dst.exists() and not force:
                    logger.debug("\t\tSkipping existing HTML {}".format(dst))
                    continue

                logger.info("\t\tExporting HTML file to {}".format(dst))
                html, _ = read_file(src)
                new_html = update_html_for_static(book=book, html_content=html)
                save_bs_output(new_html, dst, UTF8)
                html_book_optimized_files.append(dst)
                update_download_cache(src, dst)
            else:
                try:
                    handle_companion_file(
                        fpath,
                        force=force,
                        html_file_list=html_book_optimized_files,
                        s3_storage=s3_storage,
                        book=book,
                    )
                except Exception as e:
                    logger.exception(e)
                    logger.error(
                        "\t\tException while handling companion file: {}".format(e)
                    )
    if s3_storage and html_book_optimized_files:
        upload_to_cache(
            asset=html_book_optimized_files,
            book_format="html",
            etag=book.html_etag,
            book_id=book.id,
            s3_storage=s3_storage,
            optimizer_version=optimizer_version,
        )

    # other formats
    for book_format in formats:  # renamed from "format" to avoid shadowing the builtin
        if book_format not in book.formats() or book_format == "html":
            continue
        book_file = src_dir.joinpath(fname_for(book, book_format))
        if book_file.exists():
            try:
                handle_companion_file(
                    book_file,
                    archive_name_for(book, book_format),
                    force=force,
                    book=book,
                    s3_storage=s3_storage,
                )
            except Exception as e:
                logger.exception(e)
                logger.error(
                    "\t\tException while handling companion file: {}".format(e)
                )
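
The optimize_gif / optimize_png / optimize_jpeg helpers above shell out through exec_cmd, which is not shown in these examples; presumably it is a thin wrapper around subprocess. A standalone sketch of the same command pipelines under that assumption (gifsicle, pngquant, advdef and jpegoptim must be on PATH):

import pathlib
import subprocess

def exec_cmd(args):
    # assumed behavior: run the command, fail loudly on a non-zero exit code
    subprocess.run([str(arg) for arg in args], check=True)

def optimize_image(src: pathlib.Path, dst: pathlib.Path):
    # same commands and flags as in the example above
    if src.suffix == ".png":
        exec_cmd(["pngquant", "--nofs", "--force", "--output", dst, src])
        exec_cmd(["advdef", "-z", "-4", "-i", "5", dst])
    elif src.suffix in (".jpg", ".jpeg"):
        if src != dst:
            dst.write_bytes(src.read_bytes())  # jpegoptim optimizes in place
        exec_cmd(["jpegoptim", "--strip-all", "-m50", dst])
    elif src.suffix == ".gif":
        exec_cmd(["gifsicle", "-O3", src, "-o", dst])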
Code Example #4
File: export.py Project: kiwix/gutenberg
def export_book_to(book,
                   static_folder, download_cache,
                   cached_files, languages, formats, books,
                   project_id, force=False):
    logger.info("\tExporting Book #{id}.".format(id=book.id))

    # actual book content, as HTML
    html, encoding = html_content_for(book=book,
                                      static_folder=static_folder,
                                      download_cache=download_cache)
    if html:
        article_fpath = os.path.join(static_folder, article_name_for(book))
        if not path(article_fpath).exists() or force:
            logger.info("\t\tExporting to {}".format(article_fpath))
            new_html = update_html_for_static(book=book, html_content=html)
            save_bs_output(new_html, article_fpath, UTF8)
        else:
            logger.info("\t\tSkipping HTML article {}".format(article_fpath))

    def symlink_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tSymlinking {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).link(dst)  # hard link
        except IOError:
            logger.error("/!\ Unable to symlink missing file {}".format(src))
            return

    def copy_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tCopying {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).copy(dst)
        except IOError:
            logger.error("/!\ Unable to copy missing file {}".format(src))
            return

    def optimize_image(src, dst, force=False):
        if path(dst).exists() and not force:
            logger.info("\tSkipping image optimization for {}".format(dst))
            return dst
        logger.info("\tOptimizing image {}".format(dst))
        if path(src).ext == '.png':
            return optimize_png(src, dst)
        if path(src).ext in ('.jpg', '.jpeg'):
            return optimize_jpeg(src, dst)
        if path(src).ext == '.gif':
            return optimize_gif(src, dst)
        return dst

    def optimize_gif(src, dst):
        exec_cmd(['gifsicle', '-O3', src, '-o', dst])

    def optimize_png(src, dst):
        exec_cmd(['pngquant', '--nofs', '--force',
                  '--output', dst, src])
        exec_cmd(['advdef', '-z', '-4', '-i', '5', dst])

    def optimize_jpeg(src, dst):
        copy_from_cache(src, dst)
        exec_cmd(['jpegoptim', '--strip-all', '-m50', dst])

    def optimize_epub(src, dst):
        logger.info("\t\tCreating ePUB off {} at {}".format(src, dst))
        zipped_files = []
        # create temp directory to extract to
        tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)

        with zipfile.ZipFile(src, 'r') as zf:
            zipped_files = zf.namelist()
            zf.extractall(tmpd)

        remove_cover = False
        for fname in list(zipped_files):  # iterate a copy; the list is mutated below
            fnp = os.path.join(tmpd, fname)
            if path(fname).ext in ('.png', '.jpeg', '.jpg', '.gif'):

                # special case to remove ugly cover
                if fname.endswith('cover.jpg') and is_bad_cover(fnp):
                    zipped_files.remove(fname)
                    remove_cover = True
                else:
                    optimize_image(fnp, fnp)

            if path(fname).ext in ('.htm', '.html'):
                html_content, html_encoding = read_file(fnp)
                html = update_html_for_static(book=book,
                                              html_content=html_content,
                                              epub=True)
                save_bs_output(html, fnp, UTF8)

            if path(fname).ext == '.ncx':
                pattern = "*** START: FULL LICENSE ***"
                ncx, ncx_encoding = read_file(fnp)
                soup = BeautifulSoup(ncx, 'lxml-xml')
                for tag in soup.findAll('text'):
                    if pattern in tag.text:
                        block = tag.parent.parent
                        # drop everything after the license block, then the
                        # block itself (materialize siblings before mutating)
                        for sibling in list(block.next_siblings):
                            sibling.extract()
                        block.decompose()

                save_bs_output(soup, fnp, UTF8)

        # delete {id}/cover.jpg if exist and update {id}/content.opf
        if remove_cover:

            # remove cover
            path(
                os.path.join(tmpd, text_type(book.id), 'cover.jpg')).unlink_p()

            soup = None
            opff = os.path.join(tmpd, text_type(book.id), 'content.opf')
            if os.path.exists(opff):
                opff_content, opff_encoding = read_file(opff)
                soup = BeautifulSoup(opff_content, 'lxml-xml')

                for elem in soup.findAll():
                    if getattr(elem, 'attrs', {}).get('href') == 'cover.jpg':
                        elem.decompose()

                save_bs_output(soup, opff, UTF8)

        # bundle epub as zip
        zip_epub(epub_fpath=dst,
                 root_folder=tmpd,
                 fpaths=zipped_files)

        path(tmpd).rmtree_p()

    def handle_companion_file(fname, dstfname=None, book=None,
                              force=False, as_ext=None):
        ext = path(fname).ext if as_ext is None else as_ext
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        if path(dst).exists() and not force:
            logger.debug("\t\tSkipping existing companion {}".format(dstfname))
            return

        # optimization based on mime/extension
        if ext in ('.png', '.jpg', '.jpeg', '.gif'):
            logger.info("\t\tCopying and optimizing image companion {}"
                        .format(fname))
            # copy_from_cache(src, dst)
            optimize_image(src, dst)
        elif ext == '.epub':
            logger.info("\t\tCreating optimized EPUB file {}".format(fname))
            tmp_epub = tempfile.NamedTemporaryFile(suffix='.epub',
                                                   dir=TMP_FOLDER)
            tmp_epub.close()
            try:
                optimize_epub(src, tmp_epub.name)
            except zipfile.BadZipFile:
                logger.warn("\t\tBad zip file. "
                            "Copying as it might be working{}".format(fname))
                handle_companion_file(fname, dstfname, book, force,
                                      as_ext='zip')
            else:
                path(tmp_epub.name).move(dst)
        else:
            # excludes files created by Windows Explorer
            if src.endswith('_Thumbs.db'):
                return
            # copy otherwise (PDF mostly)
            logger.debug("\t\tshitty ext: {}".format(dst))
            logger.info("\t\tCopying companion file to {}".format(fname))
            copy_from_cache(src, dst)

    # associated files (images, etc)
    for fname in [fn for fn in cached_files
                  if fn.startswith("{}_".format(book.id))]:

        if path(fname).ext in ('.html', '.htm'):
            src = os.path.join(path(download_cache).abspath(), fname)
            dst = os.path.join(path(static_folder).abspath(), fname)

            if path(dst).exists() and not force:
                logger.debug("\t\tSkipping existing HTML {}".format(dst))
                continue

            logger.info("\t\tExporting HTML file to {}".format(dst))
            html, encoding = read_file(src)
            new_html = update_html_for_static(book=book, html_content=html)
            save_bs_output(new_html, dst, UTF8)
        else:
            try:
                handle_companion_file(fname, force=force)
            except Exception as e:
                logger.exception(e)
                logger.error("\t\tException while handling companion file: {}"
                             .format(e))

    # other formats
    for book_format in formats:  # renamed from "format" to avoid shadowing the builtin
        if book_format not in book.formats() or book_format == 'html':
            continue
        try:
            handle_companion_file(fname_for(book, book_format),
                                  archive_name_for(book, book_format),
                                  force=force)
        except Exception as e:
            logger.exception(e)
            logger.error("\t\tException while handling companion file: {}"
                         .format(e))

    # book presentation article
    cover_fpath = os.path.join(static_folder,
                               article_name_for(book=book, cover=True))
    if not path(cover_fpath).exists() or force:
        logger.info("\t\tExporting to {}".format(cover_fpath))
        html = cover_html_content_for(book=book,
                                      static_folder=static_folder,
                                      books=books, project_id=project_id)
        with open(cover_fpath, 'w') as f:
            if six.PY2:
                f.write(html.encode(UTF8))
            else:
                f.write(html)
    else:
        logger.info("\t\tSkipping cover {}".format(cover_fpath))