Example no. 1
def save_author_file(author, static_folder, books, project_id, force=False):
    fpath = os.path.join(static_folder, "{}.html".format(author.fname()))
    if path(fpath).exists() and not force:
        logger.debug("\t\tSkipping author file {}".format(fpath))
        return
    logger.debug("\t\tSaving author file {}".format(fpath))
    save_file(author_html_content_for(author, books, project_id), fpath, UTF8)
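save_author_file follows a skip-unless-force idiom: keep an existing output file unless the caller explicitly overrides it. The same idiom as a minimal standalone sketch (write_if_missing is a hypothetical name, not part of the original code):

import os

def write_if_missing(fpath, content, force=False):
    # keep the existing file unless the caller explicitly forces a rewrite
    if os.path.exists(fpath) and not force:
        return False
    with open(fpath, "w", encoding="utf-8") as f:
        f.write(content)
    return True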
Example no. 2
def save_author_file(author, static_folder, books, project_id, force=False):
    fpath = os.path.join(static_folder, "{}.html".format(author.fname()))
    if path(fpath).exists() and not force:
        logger.debug("\t\tSkipping author file {}".format(fpath))
        return
    logger.debug("\t\tSaving author file {}".format(fpath))
    save_file(
        author_html_content_for(author, static_folder, books, project_id),
        fpath, UTF8)
Example no. 3
def exec_cmd(cmd):
    if isinstance(cmd, (tuple, list)):
        args = cmd
    else:
        args = cmd.split(' ')
    logger.debug(" ".join(args))
    if six.PY3:
        return subprocess.run(args).returncode
    else:
        return subprocess.call(args)
Example no. 4
def exec_cmd(cmd):
    if isinstance(cmd, (tuple, list)):
        args = cmd
    else:
        args = cmd.split(" ")
    logger.debug(" ".join(args))
    if six.PY3:
        return subprocess.run(args).returncode
    else:
        return subprocess.call(args)
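exec_cmd accepts either an argument list or a plain string, which it splits naively on single spaces. A self-contained Python 3 sketch of the same dispatch, showing why the list form is safer (run_cmd is a hypothetical name):

import subprocess

def run_cmd(cmd):
    # list form is passed through untouched; string form is split naively
    args = cmd if isinstance(cmd, (tuple, list)) else cmd.split(" ")
    return subprocess.run(args).returncode

# run_cmd(["touch", "my file.txt"])  # one argument that contains a space: correct
# run_cmd("touch my file.txt")       # naive split yields two arguments: two files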
Example no. 5
def setup_database(wipe=False):
    logger.info("Setting up the database")

    for model in (License, Format, Author, Book, BookFormat, Url):
        if wipe:
            model.drop_table(fail_silently=True)
        if not model.table_exists():
            model.create_table()
            logger.debug("Created table for {}".format(model._meta.name))
            load_fixtures(model)
        else:
            logger.debug("{} table already exists.".format(model._meta.name))
Example no. 6
def setup_database(wipe=False):
    logger.info("Setting up the database")

    for model in (License, Format, Author, Book, BookFormat, Url):
        if wipe:
            model.drop_table(fail_silently=True)
        if not model.table_exists():
            model.create_table()
            logger.debug("Created table for {}".format(model._meta.name))
            load_fixtures(model)
        else:
            logger.debug("{} table already exists.".format(model._meta.name))
Example no. 7
def download_covers(book, download_cache):
    cover = "{}_cover.jpg".format(book.id)
    fpath = os.path.join(download_cache, cover)
    has_cover = Book.select(Book.cover_page).where(Book.id == book.id)
    if has_cover:
        url = "{}{}/pg{}.cover.medium.jpg".format(IMAGE_BASE, book.id,
                                                  book.id)
        logger.debug("Downloading {}".format(url))
        download_file(url, fpath)
    else:
        logger.debug("No Book Cover found for Book #{}".format(book.id))
    return True
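The cover URL repeats the book id in the path. A quick sketch with an illustrative base URL (the actual IMAGE_BASE value is defined elsewhere in the project and may differ):

IMAGE_BASE = "http://aleph.gutenberg.org/cache/epub/"  # assumed value, for illustration
book_id = 1342
url = "{}{}/pg{}.cover.medium.jpg".format(IMAGE_BASE, book_id, book_id)
# -> http://aleph.gutenberg.org/cache/epub/1342/pg1342.cover.medium.jpg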
Example no. 8
    def handle_companion_file(fname,
                              dstfname=None,
                              book=None,
                              force=False,
                              as_ext=None):
        ext = path(fname).ext if as_ext is None else as_ext
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        if path(dst).exists() and not force:
            logger.debug("\t\tSkipping existing companion {}".format(dstfname))
            return

        # optimization based on mime/extension
        if ext in (".png", ".jpg", ".jpeg", ".gif"):
            logger.info(
                "\t\tCopying and optimizing image companion {}".format(fname))
            # copy_from_cache(src, dst)
            optimize_image(src, dst)
        elif ext == ".epub":
            logger.info("\t\tCreating optimized EPUB file {}".format(fname))
            tmp_epub = tempfile.NamedTemporaryFile(suffix=".epub",
                                                   dir=TMP_FOLDER)
            tmp_epub.close()
            try:
                optimize_epub(src, tmp_epub.name)
            except zipfile.BadZipFile:
                logger.warning("\t\tBad zip file. "
                               "Copying as it might be working: {}".format(fname))
                handle_companion_file(fname,
                                      dstfname,
                                      book,
                                      force,
                                      as_ext="zip")
            else:
                path(tmp_epub.name).move(dst)
        else:
            # excludes files created by Windows Explorer
            if src.endswith("_Thumbs.db"):
                return
            # copy otherwise (PDF mostly)
            logger.debug("\t\tshitty ext: {}".format(dst))
            logger.info("\t\tCopying companion file to {}".format(fname))
            copy_from_cache(src, dst)
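The EPUB branch relies on a subtle temp-file trick: with the default delete=True, closing the NamedTemporaryFile deletes the file, and only its path string survives to be handed to optimize_epub. Isolated as a sketch:

import tempfile

tmp = tempfile.NamedTemporaryFile(suffix=".epub")
name = tmp.name
tmp.close()  # the file is deleted here; only the name string remains
# optimize_epub(src, name)  # a later step can now create that path fresh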
Example no. 9
def download_cover(book, book_dir, s3_storage, optimizer_version):
    has_cover = Book.select(Book.cover_page).where(Book.id == book.id)
    if has_cover:
        # try to download optimized cover from cache if s3_storage
        url = "{}{}/pg{}.cover.medium.jpg".format(IMAGE_BASE, book.id, book.id)
        cover = "{}_cover_image.jpg".format(book.id)
        if (book_dir.joinpath("optimized").joinpath(cover).exists()
                or book_dir.joinpath("unoptimized").joinpath(cover).exists()):
            logger.debug(f"Cover already exists for book #{book.id}")
            return
        # fetch the upstream ETag only once we know a download may be needed
        etag = get_etag_from_url(url)
        downloaded_from_cache = False
        if s3_storage:
            logger.info(
                f"Trying to download cover for {book.id} from optimization cache"
            )
            downloaded_from_cache = download_from_cache(
                book=book,
                etag=etag,
                book_format="cover",
                dest_dir=book_dir.joinpath("optimized"),
                s3_storage=s3_storage,
                optimizer_version=optimizer_version,
            )
        if not downloaded_from_cache:
            logger.debug("Downloading {}".format(url))
            if download_file(url,
                             book_dir.joinpath("unoptimized").joinpath(cover)):
                book.cover_etag = etag
                book.save()
    else:
        logger.debug("No Book Cover found for Book #{}".format(book.id))
Example no. 10
    def handle_companion_file(fname, dstfname=None, book=None,
                              force=False, as_ext=None):
        ext = path(fname).ext if as_ext is None else as_ext
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        if path(dst).exists() and not force:
            logger.debug("\t\tSkipping existing companion {}".format(dstfname))
            return

        # optimization based on mime/extension
        if ext in ('.png', '.jpg', '.jpeg', '.gif'):
            logger.info("\t\tCopying and optimizing image companion {}"
                        .format(fname))
            # copy_from_cache(src, dst)
            optimize_image(src, dst)
        elif ext == '.epub':
            logger.info("\t\tCreating optimized EPUB file {}".format(fname))
            tmp_epub = tempfile.NamedTemporaryFile(suffix='.epub',
                                                   dir=TMP_FOLDER)
            tmp_epub.close()
            try:
                optimize_epub(src, tmp_epub.name)
            except zipfile.BadZipFile:
                logger.warning("\t\tBad zip file. "
                               "Copying as it might be working: {}".format(fname))
                handle_companion_file(fname, dstfname, book, force,
                                      as_ext='zip')
            else:
                path(tmp_epub.name).move(dst)
        else:
            # excludes files created by Windows Explorer
            if src.endswith('_Thumbs.db'):
                return
            # copy otherwise (PDF mostly)
            logger.debug("\t\tshitty ext: {}".format(dst))
            logger.info("\t\tCopying companion file to {}".format(fname))
            copy_from_cache(src, dst)
Example no. 11
def load_fixtures(model):
    logger.info("Loading fixtures for {}".format(model._meta.name))

    for fixture in getattr(model._meta, "fixtures", []):
        f = model.create(**fixture)
        logger.debug("[fixtures] Created {}".format(f))
Example no. 12
def load_fixtures(model):
    logger.info("Loading fixtures for {}".format(model._meta.name))

    for fixture in getattr(model._meta, 'fixtures', []):
        f = model.create(**fixture)
        logger.debug("[fixtures] Created {}".format(f))
Example no. 13
def download_book(book, download_cache, languages, formats, force):
    logger.info(
        "\tDownloading content files for Book #{id}".format(id=book.id))

    # apply filters
    if not formats:
        formats = list(FORMAT_MATRIX.keys())

    # HTML is our base for ZIM so add it if not present
    if "html" not in formats:
        formats.append("html")

    for format in formats:

        fpath = os.path.join(download_cache, fname_for(book, format))

        # check if already downloaded
        if path(fpath).exists() and not force:
            logger.debug("\t\t{fmt} already exists at {path}".format(
                fmt=format, path=fpath))
            continue

        # retrieve corresponding BookFormat
        bfs = BookFormat.filter(book=book)

        if format == "html":
            patterns = [
                "mnsrb10h.htm",
                "8ledo10h.htm",
                "tycho10f.htm",
                "8ledo10h.zip",
                "salme10h.htm",
                "8nszr10h.htm",
                "{id}-h.html",
                "{id}.html.gen",
                "{id}-h.htm",
                "8regr10h.zip",
                "{id}.html.noimages",
                "8lgme10h.htm",
                "tycho10h.htm",
                "tycho10h.zip",
                "8lgme10h.zip",
                "8indn10h.zip",
                "8resp10h.zip",
                "20004-h.htm",
                "8indn10h.htm",
                "8memo10h.zip",
                "fondu10h.zip",
                "{id}-h.zip",
                "8mort10h.zip",
            ]
            bfso = bfs
            bfs = bfs.join(Format).filter(Format.pattern << patterns)
            if not bfs.count():
                pp(
                    list([(b.format.mime, b.format.images, b.format.pattern)
                          for b in bfs]))
                pp(
                    list([(b.format.mime, b.format.images, b.format.pattern)
                          for b in bfso]))
                logger.error("html not found")
                continue
        else:
            bfs = bfs.filter(BookFormat.format << Format.filter(
                mime=FORMAT_MATRIX.get(format)))

        if not bfs.count():
            logger.debug("[{}] not avail. for #{}# {}".format(
                format, book.id, book.title).encode("utf-8"))
            continue

        if bfs.count() > 1:
            try:
                bf = bfs.join(Format).filter(Format.images).get()
            except Exception:
                bf = bfs.get()
        else:
            bf = bfs.get()

        logger.debug("[{}] Requesting URLs for #{}# {}".format(
            format, book.id, book.title).encode("utf-8"))

        # retrieve list of URLs for format unless we have it in DB
        if bf.downloaded_from and not force:
            urls = [bf.downloaded_from]
        else:
            urld = get_urls(book)
            urls = list(reversed(urld.get(FORMAT_MATRIX.get(format))))

        allurls = list(urls)

        while urls:
            url = urls.pop()

            if len(allurls) != 1:
                if not resource_exists(url):
                    continue

            # HTML files are *sometimes* available as ZIP files
            if url.endswith(".zip"):
                zpath = "{}.zip".format(fpath)

                if not download_file(url, zpath):
                    logger.error("ZIP file donwload failed: {}".format(zpath))
                    continue

                # extract zipfile
                handle_zipped_epub(zippath=zpath,
                                   book=book,
                                   download_cache=download_cache)
            else:
                if not download_file(url, fpath):
                    logger.error("file donwload failed: {}".format(fpath))
                    continue

            # store working URL in DB
            bf.downloaded_from = url
            bf.save()

        if not bf.downloaded_from:
            logger.error("NO FILE FOR #{}/{}".format(book.id, format))
            pp(allurls)
            continue
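The format filters above assume a FORMAT_MATRIX mapping from format name to MIME type. A sketch of its likely shape (values assumed for illustration, not taken from this code):

FORMAT_MATRIX = {
    "html": "text/html",
    "epub": "application/epub+zip",
    "pdf": "application/pdf",
}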
Example no. 14
def export_all_books(static_folder,
                     download_cache,
                     concurrency,
                     languages=[],
                     formats=[],
                     only_books=[],
                     force=False):

    project_id = get_project_id(languages=languages, formats=formats,
                                only_books=only_books)

    # ensure dir exist
    path(static_folder).mkdir_p()

    books = get_list_of_filtered_books(languages=languages,
                                       formats=formats,
                                       only_books=only_books)

    if not len(get_langs_with_count(books=books)):
        critical_error("Unable to proceed. Combination of lamguages, "
                       "books and formats has no result.")

    # sz = len(list(books))
    # logger.debug("\tFiltered book collection size: {}".format(sz))

    def nb_by_fmt(fmt):
        return sum([1 for book in books
                    if BookFormat.select(BookFormat, Book, Format)
                                 .join(Book).switch(BookFormat)
                                 .join(Format)
                                 .where(Book.id == book.id)
                                 .where(Format.mime == FORMAT_MATRIX.get(fmt))
                                 .count()])

    logger.debug("\tFiltered book collection, PDF: {}"
                 .format(nb_by_fmt('pdf')))
    logger.debug("\tFiltered book collection, ePUB: {}"
                 .format(nb_by_fmt('epub')))
    logger.debug("\tFiltered book collection, HTML: {}"
                 .format(nb_by_fmt('html')))

    # export to JSON helpers
    export_to_json_helpers(books=books,
                           static_folder=static_folder,
                           languages=languages,
                           formats=formats,
                           project_id=project_id)

    # export HTML index and other static files
    export_skeleton(static_folder=static_folder, dev_mode=False,
                    languages=languages, formats=formats,
                    only_books=only_books)

    # Compute popularity
    popbooks = books.order_by(Book.downloads.desc())
    popbooks_count = popbooks.count()
    stars_limits = [0] * NB_POPULARITY_STARS
    stars = NB_POPULARITY_STARS
    nb_downloads = popbooks[0].downloads
    for ibook in range(popbooks_count):
        if (ibook > float(NB_POPULARITY_STARS - stars + 1)
                / NB_POPULARITY_STARS * popbooks_count
                and popbooks[ibook].downloads < nb_downloads):
            stars_limits[stars-1] = nb_downloads
            stars = stars - 1
        nb_downloads = popbooks[ibook].downloads

    # export to HTML
    cached_files = os.listdir(download_cache)

    for book in books:
        book.popularity = sum(
            [int(book.downloads >= stars_limits[i])
             for i in range(NB_POPULARITY_STARS)])

    def dlb(b):
        return export_book_to(b,
                              static_folder=static_folder,
                              download_cache=download_cache,
                              cached_files=cached_files,
                              languages=languages,
                              formats=formats,
                              books=books,
                              project_id=project_id,
                              force=force)
    Pool(concurrency).map(dlb, books)
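The popularity loop walks download counts in descending order and records a threshold each time a star bucket fills. The same computation restated on a plain list, as a standalone sketch (stars_limits_for is a hypothetical name):

NB_POPULARITY_STARS = 5

def stars_limits_for(downloads):
    # downloads must be sorted in descending order, like popbooks above
    limits = [0] * NB_POPULARITY_STARS
    stars = NB_POPULARITY_STARS
    previous = downloads[0]
    for i, value in enumerate(downloads):
        threshold = (float(NB_POPULARITY_STARS - stars + 1)
                     / NB_POPULARITY_STARS * len(downloads))
        if i > threshold and value < previous:
            limits[stars - 1] = previous
            stars -= 1
        previous = value
    return limits

# stars_limits_for([90, 80, 70, 60, 50, 40, 30, 20, 10, 5])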
Example no. 15
def handle_unoptimized_files(
    book,
    static_folder,
    src_dir,
    languages,
    formats,
    books,
    project_id,
    optimizer_version,
    force=False,
    title_search=False,
    add_bookshelves=False,
    s3_storage=None,
):
    def copy_file(src, dst):
        logger.info("\t\tCopying {}".format(dst))
        try:
            shutil.copy2(src, dst)
        except IOError:
            logger.error("/!\\ Unable to copy missing file {}".format(src))
            return

    def update_download_cache(unoptimized_file, optimized_file):
        book_dir = unoptimized_file.parents[1]
        optimized_dir = book_dir.joinpath("optimized")
        unoptimized_dir = book_dir.joinpath("unoptimized")
        if not optimized_dir.exists():
            optimized_dir.mkdir()
        dst = optimized_dir.joinpath(optimized_file.name)
        os.unlink(unoptimized_file)
        copy_file(optimized_file.resolve(), dst.resolve())
        if not [fpath for fpath in unoptimized_dir.iterdir()]:
            unoptimized_dir.rmdir()

    logger.info("\tExporting Book #{id}.".format(id=book.id))

    # actual book content, as HTML
    html, _ = html_content_for(book=book, src_dir=src_dir)
    html_book_optimized_files = []
    if html:
        article_fpath = static_folder.joinpath(article_name_for(book))
        if not article_fpath.exists() or force:
            logger.info("\t\tExporting to {}".format(article_fpath))
            new_html = update_html_for_static(book=book, html_content=html)
            save_bs_output(new_html, article_fpath, UTF8)
            html_book_optimized_files.append(article_fpath)
            update_download_cache(
                src_dir.joinpath(fname_for(book, "html")), article_fpath
            )
            if not src_dir.exists():
                return
        else:
            logger.info("\t\tSkipping HTML article {}".format(article_fpath))

    def optimize_image(src, dst, force=False):
        if dst.exists() and not force:
            logger.info("\tSkipping image optimization for {}".format(dst))
            return dst
        logger.info("\tOptimizing image {}".format(dst))
        if src.suffix == ".png":
            return optimize_png(str(src.resolve()), str(dst.resolve()))
        if src.suffix in (".jpg", ".jpeg"):
            return optimize_jpeg(str(src.resolve()), str(dst.resolve()))
        if src.suffix == ".gif":
            return optimize_gif(str(src.resolve()), str(dst.resolve()))
        return dst

    def optimize_gif(src, dst):
        exec_cmd(["gifsicle", "-O3", src, "-o", dst])

    def optimize_png(src, dst):
        exec_cmd(["pngquant", "--nofs", "--force", "--output", dst, src])
        exec_cmd(["advdef", "-z", "-4", "-i", "5", dst])

    def optimize_jpeg(src, dst):
        if src != dst:
            copy_file(src, dst)
        exec_cmd(["jpegoptim", "--strip-all", "-m50", dst])

    def optimize_epub(src, dst):
        logger.info("\t\tCreating ePUB off {} at {}".format(src, dst))
        zipped_files = []
        # create temp directory to extract to
        tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)

        try:
            with zipfile.ZipFile(src, "r") as zf:
                zipped_files = zf.namelist()
                zf.extractall(tmpd)
        except zipfile.BadZipFile as exc:
            shutil.rmtree(tmpd)
            raise exc

        remove_cover = False
        for fname in zipped_files:
            fnp = os.path.join(tmpd, fname)
            if path(fname).ext in (".png", ".jpeg", ".jpg", ".gif"):

                # special case to remove ugly cover
                if fname.endswith("cover.jpg") and is_bad_cover(fnp):
                    zipped_files.remove(fname)
                    remove_cover = True
                else:
                    optimize_image(pathlib.Path(fnp), pathlib.Path(fnp), force=True)

            if path(fname).ext in (".htm", ".html"):
                html_content, _ = read_file(fnp)
                html = update_html_for_static(
                    book=book, html_content=html_content, epub=True
                )
                save_bs_output(html, fnp, UTF8)

            if path(fname).ext == ".ncx":
                pattern = "*** START: FULL LICENSE ***"
                ncx, _ = read_file(fnp)
                soup = BeautifulSoup(ncx, "lxml-xml")
                for tag in soup.findAll("text"):
                    if pattern in tag.text:
                        s = tag.parent.parent
                        # snapshot the siblings first: removal mutates the tree
                        for sibling in list(s.next_siblings):
                            sibling.extract()
                        s.decompose()

                save_bs_output(soup, fnp, UTF8)

        # delete {id}/cover.jpg if exist and update {id}/content.opf
        if remove_cover:

            # remove cover
            path(os.path.join(tmpd, text_type(book.id), "cover.jpg")).unlink_p()

            soup = None
            opff = os.path.join(tmpd, text_type(book.id), "content.opf")
            if os.path.exists(opff):
                opff_content, _ = read_file(opff)
                soup = BeautifulSoup(opff_content, "lxml-xml")

                for elem in soup.findAll():
                    if getattr(elem, "attrs", {}).get("href") == "cover.jpg":
                        elem.decompose()

                save_bs_output(soup, opff, UTF8)

        # bundle epub as zip
        zip_epub(epub_fpath=dst, root_folder=tmpd, fpaths=zipped_files)

        path(tmpd).rmtree_p()

    def handle_companion_file(
        fname,
        dstfname=None,
        book=None,
        force=False,
        as_ext=None,
        html_file_list=None,
        s3_storage=None,
    ):
        ext = fname.suffix if as_ext is None else as_ext
        src = fname
        if dstfname is None:
            dstfname = fname.name
        dst = static_folder.joinpath(dstfname)
        if dst.exists() and not force:
            logger.debug("\t\tSkipping existing companion {}".format(dstfname))
            return

        # optimization based on mime/extension
        if ext in (".png", ".jpg", ".jpeg", ".gif"):
            logger.info("\t\tCopying and optimizing image companion {}".format(fname))
            optimize_image(src, dst)
            if dst.name == f"{book.id}_cover_image.jpg" and s3_storage:
                upload_to_cache(
                    asset=dst,
                    book_format="cover",
                    book_id=book.id,
                    etag=book.cover_etag,
                    s3_storage=s3_storage,
                    optimizer_version=optimizer_version,
                )
                update_download_cache(src, dst)
            elif html_file_list:
                html_file_list.append(dst)
                update_download_cache(src, dst)
        elif ext == ".epub":
            logger.info("\t\tCreating optimized EPUB file {}".format(fname))
            tmp_epub = tempfile.NamedTemporaryFile(suffix=".epub", dir=TMP_FOLDER)
            tmp_epub.close()
            try:
                optimize_epub(src, tmp_epub.name)
            except zipfile.BadZipFile:
                logger.warning(
                    "\t\tBad zip file. "
                    "Copying as it might be working: {}".format(fname)
                )
                handle_companion_file(fname, dstfname, book, force, as_ext=".zip")
            else:
                path(tmp_epub.name).move(dst)
                if s3_storage:
                    upload_to_cache(
                        asset=dst,
                        book_format="epub",
                        book_id=book.id,
                        etag=book.epub_etag,
                        s3_storage=s3_storage,
                        optimizer_version=optimizer_version,
                    )
                    update_download_cache(src, dst)
        else:
            # excludes files created by Windows Explorer
            if src.name.endswith("_Thumbs.db"):
                return
            # copy otherwise (PDF mostly)
            logger.info("\t\tCopying companion file to {}".format(dst))
            copy_file(src, dst)
            if ext != ".pdf" and ext != ".zip" and html_file_list:
                html_file_list.append(dst)
                update_download_cache(src, dst)

    # associated files (images, etc)
    for fpath in src_dir.iterdir():
        if fpath.is_file() and fpath.name.startswith(f"{book.id}_"):
            if fpath.suffix in (".html", ".htm"):
                src = fpath
                dst = static_folder.joinpath(fpath.name)
                if dst.exists() and not force:
                    logger.debug("\t\tSkipping existing HTML {}".format(dst))
                    continue

                logger.info("\t\tExporting HTML file to {}".format(dst))
                html, _ = read_file(src)
                new_html = update_html_for_static(book=book, html_content=html)
                save_bs_output(new_html, dst, UTF8)
                html_book_optimized_files.append(dst)
                update_download_cache(src, dst)
            else:
                try:
                    handle_companion_file(
                        fpath,
                        force=force,
                        html_file_list=html_book_optimized_files,
                        s3_storage=s3_storage,
                        book=book,
                    )
                except Exception as e:
                    logger.exception(e)
                    logger.error(
                        "\t\tException while handling companion file: {}".format(e)
                    )
    if s3_storage and html_book_optimized_files:
        upload_to_cache(
            asset=html_book_optimized_files,
            book_format="html",
            etag=book.html_etag,
            book_id=book.id,
            s3_storage=s3_storage,
            optimizer_version=optimizer_version,
        )

    # other formats
    for format in formats:
        if format not in book.formats() or format == "html":
            continue
        book_file = src_dir.joinpath(fname_for(book, format))
        if book_file.exists():
            try:
                handle_companion_file(
                    book_file,
                    archive_name_for(book, format),
                    force=force,
                    book=book,
                    s3_storage=s3_storage,
                )
            except Exception as e:
                logger.exception(e)
                logger.error(
                    "\t\tException while handling companion file: {}".format(e)
                )
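The .ncx license stripping removes a node and everything after it. Because removal mutates the tree, the siblings must be snapshotted before decomposing, as in the corrected loop above; isolated as a runnable sketch:

from bs4 import BeautifulSoup

xml = "<doc><a>keep</a><b>license</b><c>cut</c></doc>"
soup = BeautifulSoup(xml, "lxml-xml")
start = soup.find("b")
for sibling in list(start.next_siblings):  # snapshot first: removal mutates the tree
    sibling.extract()                      # extract() handles tags and strings alike
start.decompose()
# soup.doc now contains only <a>keep</a>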
Example no. 16
def export_all_books(
    static_folder=None,
    download_cache=None,
    concurrency=None,
    languages=[],
    formats=[],
    only_books=[],
    force=False,
    title_search=False,
    add_bookshelves=False,
    s3_storage=None,
    optimizer_version=None,
):

    project_id = get_project_id(
        languages=languages, formats=formats, only_books=only_books
    )

    # ensure dir exist
    path(static_folder).mkdir_p()

    books = get_list_of_filtered_books(
        languages=languages, formats=formats, only_books=only_books
    )

    if not len(get_langs_with_count(books=books)):
        critical_error(
            "Unable to proceed. Combination of lamguages, "
            "books and formats has no result."
        )

    # sz = len(list(books))
    # logger.debug("\tFiltered book collection size: {}".format(sz))

    def nb_by_fmt(fmt):
        return sum(
            [
                1
                for book in books
                if BookFormat.select(BookFormat, Book, Format)
                .join(Book)
                .switch(BookFormat)
                .join(Format)
                .where(Book.id == book.id)
                .where(Format.mime == FORMAT_MATRIX.get(fmt))
                .count()
            ]
        )

    logger.debug("\tFiltered book collection, PDF: {}".format(nb_by_fmt("pdf")))
    logger.debug("\tFiltered book collection, ePUB: {}".format(nb_by_fmt("epub")))
    logger.debug("\tFiltered book collection, HTML: {}".format(nb_by_fmt("html")))

    # export to JSON helpers
    export_to_json_helpers(
        books=books,
        static_folder=static_folder,
        languages=languages,
        formats=formats,
        project_id=project_id,
        title_search=title_search,
        add_bookshelves=add_bookshelves,
    )

    # export HTML index and other static files
    export_skeleton(
        static_folder=static_folder,
        dev_mode=False,
        languages=languages,
        formats=formats,
        only_books=only_books,
        title_search=title_search,
        add_bookshelves=add_bookshelves,
    )

    # Compute popularity
    popbooks = books.order_by(Book.downloads.desc())
    popbooks_count = popbooks.count()
    stars_limits = [0] * NB_POPULARITY_STARS
    stars = NB_POPULARITY_STARS
    nb_downloads = popbooks[0].downloads
    for ibook in range(popbooks_count):
        if (
            ibook
            > float(NB_POPULARITY_STARS - stars + 1)
            / NB_POPULARITY_STARS
            * popbooks_count
            and popbooks[ibook].downloads < nb_downloads
        ):
            stars_limits[stars - 1] = nb_downloads
            stars = stars - 1
        nb_downloads = popbooks[ibook].downloads

    for book in books:
        book.popularity = sum(
            [int(book.downloads >= stars_limits[i]) for i in range(NB_POPULARITY_STARS)]
        )

    def dlb(b):
        return export_book(
            b,
            static_folder=pathlib.Path(static_folder),
            book_dir=pathlib.Path(download_cache).joinpath(str(b.id)),
            languages=languages,
            formats=formats,
            books=books,
            project_id=project_id,
            force=force,
            title_search=title_search,
            add_bookshelves=add_bookshelves,
            s3_storage=s3_storage,
            optimizer_version=optimizer_version,
        )

    Pool(concurrency).map(dlb, books)
Example no. 17
def export_book_to(book,
                   static_folder, download_cache,
                   cached_files, languages, formats, books,
                   project_id, force=False):
    logger.info("\tExporting Book #{id}.".format(id=book.id))

    # actual book content, as HTML
    html, encoding = html_content_for(book=book,
                                      static_folder=static_folder,
                                      download_cache=download_cache)
    if html:
        article_fpath = os.path.join(static_folder, article_name_for(book))
        if not path(article_fpath).exists() or force:
            logger.info("\t\tExporting to {}".format(article_fpath))
            new_html = update_html_for_static(book=book, html_content=html)
            save_bs_output(new_html, article_fpath, UTF8)
        else:
            logger.info("\t\tSkipping HTML article {}".format(article_fpath))

    def symlink_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tSymlinking {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).link(dst)  # hard link
        except IOError:
            logger.error("/!\\ Unable to symlink missing file {}".format(src))
            return

    def copy_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tCopying {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).copy(dst)
        except IOError:
            logger.error("/!\\ Unable to copy missing file {}".format(src))
            return

    def optimize_image(src, dst, force=False):
        if path(dst).exists() and not force:
            logger.info("\tSkipping image optimization for {}".format(dst))
            return dst
        logger.info("\tOptimizing image {}".format(dst))
        if path(src).ext == '.png':
            return optimize_png(src, dst)
        if path(src).ext in ('.jpg', '.jpeg'):
            return optimize_jpeg(src, dst)
        if path(src).ext == '.gif':
            return optimize_gif(src, dst)
        return dst

    def optimize_gif(src, dst):
        exec_cmd(['gifsicle', '-O3', src, '-o', dst])

    def optimize_png(src, dst):
        exec_cmd(['pngquant', '--nofs', '--force',
                  '--output', dst, src])
        exec_cmd(['advdef', '-z', '-4', '-i', '5', dst])

    def optimize_jpeg(src, dst):
        copy_from_cache(src, dst)
        exec_cmd(['jpegoptim', '--strip-all', '-m50', dst])

    def optimize_epub(src, dst):
        logger.info("\t\tCreating ePUB off {} at {}".format(src, dst))
        zipped_files = []
        # create temp directory to extract to
        tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)

        with zipfile.ZipFile(src, 'r') as zf:
            zipped_files = zf.namelist()
            zf.extractall(tmpd)

        remove_cover = False
        for fname in zipped_files:
            fnp = os.path.join(tmpd, fname)
            if path(fname).ext in ('.png', '.jpeg', '.jpg', '.gif'):

                # special case to remove ugly cover
                if fname.endswith('cover.jpg') and is_bad_cover(fnp):
                    zipped_files.remove(fname)
                    remove_cover = True
                else:
                    optimize_image(fnp, fnp)

            if path(fname).ext in ('.htm', '.html'):
                html_content, html_encoding = read_file(fnp)
                html = update_html_for_static(book=book,
                                              html_content=html_content,
                                              epub=True)
                save_bs_output(html, fnp, UTF8)

            if path(fname).ext == '.ncx':
                pattern = "*** START: FULL LICENSE ***"
                ncx, ncx_encoding = read_file(fnp)
                soup = BeautifulSoup(ncx, 'lxml-xml')
                for tag in soup.findAll('text'):
                    if pattern in tag.text:
                        s = tag.parent.parent
                        # snapshot the siblings first: removal mutates the tree
                        for sibling in list(s.next_siblings):
                            sibling.extract()
                        s.decompose()

                save_bs_output(soup, fnp, UTF8)

        # delete {id}/cover.jpg if exist and update {id}/content.opf
        if remove_cover:

            # remove cover
            path(
                os.path.join(tmpd, text_type(book.id), 'cover.jpg')).unlink_p()

            soup = None
            opff = os.path.join(tmpd, text_type(book.id), 'content.opf')
            if os.path.exists(opff):
                opff_content, opff_encoding = read_file(opff)
                soup = BeautifulSoup(opff_content, 'lxml-xml')

                for elem in soup.findAll():
                    if getattr(elem, 'attrs', {}).get('href') == 'cover.jpg':
                        elem.decompose()

                save_bs_output(soup, opff, UTF8)

        # bundle epub as zip
        zip_epub(epub_fpath=dst,
                 root_folder=tmpd,
                 fpaths=zipped_files)

        path(tmpd).rmtree_p()

    def handle_companion_file(fname, dstfname=None, book=None,
                              force=False, as_ext=None):
        ext = path(fname).ext if as_ext is None else as_ext
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        if path(dst).exists() and not force:
            logger.debug("\t\tSkipping existing companion {}".format(dstfname))
            return

        # optimization based on mime/extension
        if ext in ('.png', '.jpg', '.jpeg', '.gif'):
            logger.info("\t\tCopying and optimizing image companion {}"
                        .format(fname))
            # copy_from_cache(src, dst)
            optimize_image(src, dst)
        elif ext == '.epub':
            logger.info("\t\tCreating optimized EPUB file {}".format(fname))
            tmp_epub = tempfile.NamedTemporaryFile(suffix='.epub',
                                                   dir=TMP_FOLDER)
            tmp_epub.close()
            try:
                optimize_epub(src, tmp_epub.name)
            except zipfile.BadZipFile:
                logger.warn("\t\tBad zip file. "
                            "Copying as it might be working{}".format(fname))
                handle_companion_file(fname, dstfname, book, force,
                                      as_ext='zip')
            else:
                path(tmp_epub.name).move(dst)
        else:
            # excludes files created by Windows Explorer
            if src.endswith('_Thumbs.db'):
                return
            # copy otherwise (PDF mostly)
            logger.debug("\t\tshitty ext: {}".format(dst))
            logger.info("\t\tCopying companion file to {}".format(fname))
            copy_from_cache(src, dst)

    # associated files (images, etc)
    for fname in [fn for fn in cached_files
                  if fn.startswith("{}_".format(book.id))]:

        if path(fname).ext in ('.html', '.htm'):
            src = os.path.join(path(download_cache).abspath(), fname)
            dst = os.path.join(path(static_folder).abspath(), fname)

            if path(dst).exists() and not force:
                logger.debug("\t\tSkipping existing HTML {}".format(dst))
                continue

            logger.info("\t\tExporting HTML file to {}".format(dst))
            html, encoding = read_file(src)
            new_html = update_html_for_static(book=book, html_content=html)
            save_bs_output(new_html, dst, UTF8)
        else:
            try:
                handle_companion_file(fname, force=force)
            except Exception as e:
                logger.exception(e)
                logger.error("\t\tException while handling companion file: {}"
                             .format(e))

    # other formats
    for format in formats:
        if format not in book.formats() or format == 'html':
            continue
        try:
            handle_companion_file(fname_for(book, format),
                                  archive_name_for(book, format),
                                  force=force)
        except Exception as e:
            logger.exception(e)
            logger.error("\t\tException while handling companion file: {}"
                         .format(e))

    # book presentation article
    cover_fpath = os.path.join(static_folder,
                               article_name_for(book=book, cover=True))
    if not path(cover_fpath).exists() or force:
        logger.info("\t\tExporting to {}".format(cover_fpath))
        html = cover_html_content_for(book=book,
                                      static_folder=static_folder,
                                      books=books, project_id=project_id)
        with open(cover_fpath, 'w') as f:
            if six.PY2:
                f.write(html.encode(UTF8))
            else:
                f.write(html)
    else:
        logger.info("\t\tSkipping cover {}".format(cover_fpath))
Example no. 18
def export_book_to(book,
                   static_folder, download_cache,
                   cached_files, languages, formats, books,
                   project_id, force=False):
    logger.info("\tExporting Book #{id}.".format(id=book.id))

    # actual book content, as HTML
    html, encoding = html_content_for(book=book,
                                      static_folder=static_folder,
                                      download_cache=download_cache)
    if html:
        article_fpath = os.path.join(static_folder, article_name_for(book))
        if not path(article_fpath).exists() or force:
            logger.info("\t\tExporting to {}".format(article_fpath))
            new_html = update_html_for_static(book=book, html_content=html)
            save_bs_output(new_html, article_fpath, UTF8)
        else:
            logger.info("\t\tSkipping HTML article {}".format(article_fpath))

    def symlink_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tSymlinking {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).link(dst)  # hard link
        except IOError:
            logger.error("/!\ Unable to symlink missing file {}".format(src))
            return

    def copy_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tCopying {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).copy(dst)
        except IOError:
            logger.error("/!\ Unable to copy missing file {}".format(src))
            return

    def optimize_image(src, dst, force=False):
        if path(dst).exists() and not force:
            logger.info("\tSkipping image optimization for {}".format(dst))
            return dst
        logger.info("\tOptimizing image {}".format(dst))
        if path(src).ext == '.png':
            return optimize_png(src, dst)
        if path(src).ext in ('.jpg', '.jpeg'):
            return optimize_jpeg(src, dst)
        if path(src).ext == '.gif':
            return optimize_gif(src, dst)
        return dst

    def optimize_gif(src, dst):
        exec_cmd(['gifsicle', '-O3', src, '-o', dst])

    def optimize_png(src, dst):
        exec_cmd(['pngquant', '--nofs', '--force',
                  '--output', dst, src])
        exec_cmd(['advdef', '-z', '-4', '-i', '5', dst])

    def optimize_jpeg(src, dst):
        copy_from_cache(src, dst)
        exec_cmd(['jpegoptim', '--strip-all', '-m50', dst])

    def optimize_epub(src, dst):
        logger.info("\t\tCreating ePUB off {} at {}".format(src, dst))
        zipped_files = []
        # create temp directory to extract to
        tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)

        with zipfile.ZipFile(src, 'r') as zf:
            zipped_files = zf.namelist()
            zf.extractall(tmpd)

        remove_cover = False
        for fname in zipped_files:
            fnp = os.path.join(tmpd, fname)
            if path(fname).ext in ('.png', '.jpeg', '.jpg', '.gif'):

                # special case to remove ugly cover
                if fname.endswith('cover.jpg') and is_bad_cover(fnp):
                    zipped_files.remove(fname)
                    remove_cover = True
                else:
                    optimize_image(fnp, fnp)

            if path(fname).ext in ('.htm', '.html'):
                html_content, html_encoding = read_file(fnp)
                html = update_html_for_static(book=book,
                                              html_content=html_content,
                                              epub=True)
                save_bs_output(html, fnp, UTF8)

            if path(fname).ext == '.ncx':
                pattern = "*** START: FULL LICENSE ***"
                ncx, ncx_encoding = read_file(fnp)
                soup = BeautifulSoup(ncx, 'lxml-xml')
                for tag in soup.findAll('text'):
                    if pattern in tag.text:
                        s = tag.parent.parent
                        # snapshot the siblings first: removal mutates the tree
                        for sibling in list(s.next_siblings):
                            sibling.extract()
                        s.decompose()

                save_bs_output(soup, fnp, UTF8)

        # delete {id}/cover.jpg if exist and update {id}/content.opf
        if remove_cover:

            # remove cover
            path(
                os.path.join(tmpd, text_type(book.id), 'cover.jpg')).unlink_p()

            soup = None
            opff = os.path.join(tmpd, text_type(book.id), 'content.opf')
            if os.path.exists(opff):
                opff_content, opff_encoding = read_file(opff)
                soup = BeautifulSoup(opff_content, 'lxml-xml')

                for elem in soup.findAll():
                    if getattr(elem, 'attrs', {}).get('href') == 'cover.jpg':
                        elem.decompose()

                save_bs_output(soup, opff, UTF8)

        # bundle epub as zip
        zip_epub(epub_fpath=dst,
                 root_folder=tmpd,
                 fpaths=zipped_files)

        path(tmpd).rmtree_p()

    def handle_companion_file(fname, dstfname=None, book=None,
                              force=False, as_ext=None):
        ext = path(fname).ext if as_ext is None else as_ext
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        if path(dst).exists() and not force:
            logger.debug("\t\tSkipping existing companion {}".format(dstfname))
            return

        # optimization based on mime/extension
        if ext in ('.png', '.jpg', '.jpeg', '.gif'):
            logger.info("\t\tCopying and optimizing image companion {}"
                        .format(fname))
            # copy_from_cache(src, dst)
            optimize_image(src, dst)
        elif ext == '.epub':
            logger.info("\t\tCreating optimized EPUB file {}".format(fname))
            tmp_epub = tempfile.NamedTemporaryFile(suffix='.epub',
                                                   dir=TMP_FOLDER)
            tmp_epub.close()
            try:
                optimize_epub(src, tmp_epub.name)
            except zipfile.BadZipFile:
                logger.warn("\t\tBad zip file. "
                            "Copying as it might be working{}".format(fname))
                handle_companion_file(fname, dstfname, book, force,
                                      as_ext='zip')
            else:
                path(tmp_epub.name).move(dst)
        else:
            # excludes files created by Windows Explorer
            if src.endswith('_Thumbs.db'):
                return
            # copy otherwise (PDF mostly)
            logger.debug("\t\tshitty ext: {}".format(dst))
            logger.info("\t\tCopying companion file to {}".format(fname))
            copy_from_cache(src, dst)

    # associated files (images, etc)
    for fname in [fn for fn in cached_files
                  if fn.startswith("{}_".format(book.id))]:

        if path(fname).ext in ('.html', '.htm'):
            src = os.path.join(path(download_cache).abspath(), fname)
            dst = os.path.join(path(static_folder).abspath(), fname)

            if path(dst).exists() and not force:
                logger.debug("\t\tSkipping existing HTML {}".format(dst))
                continue

            logger.info("\t\tExporting HTML file to {}".format(dst))
            html, encoding = read_file(src)
            new_html = update_html_for_static(book=book, html_content=html)
            save_bs_output(new_html, dst, UTF8)
        else:
            try:
                handle_companion_file(fname, force=force)
            except Exception as e:
                logger.exception(e)
                logger.error("\t\tException while handling companion file: {}"
                             .format(e))

    # other formats
    for format in formats:
        if format not in book.formats() or format == 'html':
            continue
        try:
            handle_companion_file(fname_for(book, format),
                                  archive_name_for(book, format),
                                  force=force)
        except Exception as e:
            logger.exception(e)
            logger.error("\t\tException while handling companion file: {}"
                         .format(e))

    # book presentation article
    cover_fpath = os.path.join(static_folder,
                               article_name_for(book=book, cover=True))
    if not path(cover_fpath).exists() or force:
        logger.info("\t\tExporting to {}".format(cover_fpath))
        html = cover_html_content_for(book=book,
                                      static_folder=static_folder,
                                      books=books, project_id=project_id)
        with open(cover_fpath, 'w') as f:
            if six.PY2:
                f.write(html.encode(UTF8))
            else:
                f.write(html)
    else:
        logger.info("\t\tSkipping cover {}".format(cover_fpath))
Example no. 19
def download_book(book, download_cache, languages, formats, force, s3_storage,
                  optimizer_version):
    logger.info(
        "\tDownloading content files for Book #{id}".format(id=book.id))

    # apply filters
    if not formats:
        formats = list(FORMAT_MATRIX.keys())

    # HTML is our base for ZIM so add it if not present
    if "html" not in formats:
        formats.append("html")

    book_dir = pathlib.Path(download_cache).joinpath(str(book.id))
    optimized_dir = book_dir.joinpath("optimized")
    unoptimized_dir = book_dir.joinpath("unoptimized")
    unsuccessful_formats = []
    for book_format in formats:

        unoptimized_fpath = unoptimized_dir.joinpath(
            fname_for(book, book_format))
        optimized_fpath = optimized_dir.joinpath(
            archive_name_for(book, book_format))

        # check if already downloaded
        if (unoptimized_fpath.exists()
                or optimized_fpath.exists()) and not force:
            logger.debug(
                f"\t\t{book_format} already exists for book #{book.id}")
            continue

        if force:
            if book_format == "html":
                for fpath in book_dir.iterdir():
                    if fpath.is_file() and fpath.suffix not in [
                            ".pdf", ".epub"
                    ]:
                        fpath.unlink()
            else:
                if unoptimized_fpath.exists():
                    unoptimized_fpath.unlink()
                if optimized_fpath.exists():
                    optimized_fpath.unlink()
            # delete dirs which are empty
            for dir_name in [optimized_dir, unoptimized_dir]:
                if not dir_name.exists():
                    continue
                if not list(dir_name.iterdir()):
                    dir_name.rmdir()

        # retrieve corresponding BookFormat
        bfs = BookFormat.filter(book=book)

        if book_format == "html":
            patterns = [
                "mnsrb10h.htm",
                "8ledo10h.htm",
                "tycho10f.htm",
                "8ledo10h.zip",
                "salme10h.htm",
                "8nszr10h.htm",
                "{id}-h.html",
                "{id}.html.gen",
                "{id}-h.htm",
                "8regr10h.zip",
                "{id}.html.noimages",
                "8lgme10h.htm",
                "tycho10h.htm",
                "tycho10h.zip",
                "8lgme10h.zip",
                "8indn10h.zip",
                "8resp10h.zip",
                "20004-h.htm",
                "8indn10h.htm",
                "8memo10h.zip",
                "fondu10h.zip",
                "{id}-h.zip",
                "8mort10h.zip",
            ]
            bfso = bfs
            bfs = bfs.join(Format).filter(Format.pattern << patterns)
            if not bfs.count():
                pp(
                    list([(b.format.mime, b.format.images, b.format.pattern)
                          for b in bfs]))
                pp(
                    list([(b.format.mime, b.format.images, b.format.pattern)
                          for b in bfso]))
                logger.error("html not found")
                unsuccessful_formats.append(book_format)
                continue
        else:
            bfs = bfs.filter(BookFormat.format << Format.filter(
                mime=FORMAT_MATRIX.get(book_format)))

        if not bfs.count():
            logger.debug("[{}] not avail. for #{}# {}".format(
                book_format, book.id, book.title))
            unsuccessful_formats.append(book_format)
            continue

        if bfs.count() > 1:
            try:
                bf = bfs.join(Format).filter(Format.images).get()
            except Exception:
                bf = bfs.get()
        else:
            bf = bfs.get()

        logger.debug("[{}] Requesting URLs for #{}# {}".format(
            book_format, book.id, book.title))

        # retrieve list of URLs for format unless we have it in DB
        if bf.downloaded_from and not force:
            urls = [bf.downloaded_from]
        else:
            urld = get_urls(book)
            urls = list(reversed(urld.get(FORMAT_MATRIX.get(book_format))))

        allurls = list(urls)
        downloaded_from_cache = False

        while urls:
            url = urls.pop()

            # for development
            # if len(allurls) != 1:
            #     if not resource_exists(url):
            #         continue

            # HTML files are *sometimes* available as ZIP files
            if url.endswith(".zip"):
                zpath = unoptimized_dir.joinpath(
                    f"{fname_for(book, book_format)}.zip")

                etag = get_etag_from_url(url)
                if s3_storage:
                    if download_from_cache(
                            book=book,
                            etag=etag,
                            book_format=book_format,
                            dest_dir=optimized_dir,
                            s3_storage=s3_storage,
                            optimizer_version=optimizer_version,
                    ):
                        downloaded_from_cache = True
                        break
                if not download_file(url, zpath):
                    logger.error("ZIP file donwload failed: {}".format(zpath))
                    continue
                # save etag
                book.html_etag = etag
                book.save()
                # extract zipfile
                handle_zipped_epub(zippath=zpath,
                                   book=book,
                                   dst_dir=unoptimized_dir)
            else:
                if (url.endswith(".htm") or url.endswith(".html")
                        or url.endswith(".html.utf8")
                        or url.endswith(".epub")):
                    etag = get_etag_from_url(url)
                    if s3_storage:
                        logger.info(
                            f"Trying to download {book.id} from optimization cache"
                        )
                        if download_from_cache(
                                book=book,
                                etag=etag,
                                book_format=book_format,
                                dest_dir=optimized_dir,
                                s3_storage=s3_storage,
                                optimizer_version=optimizer_version,
                        ):
                            downloaded_from_cache = True
                            break
                if not download_file(url, unoptimized_fpath):
                    logger.error(
                        "file donwload failed: {}".format(unoptimized_fpath))
                    continue
                # save the etag for html or epub if the download succeeded
                if url.endswith((".htm", ".html", ".html.utf8")):
                    logger.debug(f"Saving html ETag for {book.id}")
                    book.html_etag = etag
                    book.save()
                elif url.endswith(".epub"):
                    logger.debug(f"Saving epub ETag for {book.id}")
                    book.epub_etag = etag
                    book.save()

            # store working URL in DB
            bf.downloaded_from = url
            bf.save()
            # break as we got a working URL
            break

        if not bf.downloaded_from and not downloaded_from_cache:
            logger.error("NO FILE FOR #{}/{}".format(book.id, book_format))
            # delete instance from DB if download failed
            logger.info("Deleting instance from DB")
            bf.delete_instance()
            unsuccessful_formats.append(book_format)
            pp(allurls)  # dump every URL we tried, for debugging

    # delete book from DB if not downloaded in any format
    if len(unsuccessful_formats) == len(formats):
        logger.debug(
            f"Book #{book.id} could not be downloaded in any format. Deleting from DB ..."
        )
        book.delete_instance()
        if book_dir.exists():
            shutil.rmtree(book_dir, ignore_errors=True)
        return
    download_cover(book, book_dir, s3_storage, optimizer_version)
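
The loop above keys both its S3 cache lookups and its DB bookkeeping on the remote file's ETag, which a server returns in its response headers without the body. A minimal, self-contained sketch of that pattern, assuming only the requests library; fetch_etag and should_redownload are illustrative stand-ins, not the scraper's actual get_etag_from_url API:

import requests


def fetch_etag(url, timeout=10):
    # a HEAD request is enough: we only need the headers, not the body
    try:
        resp = requests.head(url, allow_redirects=True, timeout=timeout)
        return resp.headers.get("ETag")
    except requests.RequestException:
        return None


def should_redownload(url, stored_etag):
    # download only when the remote ETag differs from the stored one;
    # if the server advertises no ETag, play it safe and re-download
    remote_etag = fetch_etag(url)
    return remote_etag is None or remote_etag != stored_etag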
Esempio n. 20
0
def export_all_books(static_folder,
                     download_cache,
                     concurrency,
                     languages=[],
                     formats=[],
                     only_books=[],
                     force=False):

    project_id = get_project_id(languages=languages,
                                formats=formats,
                                only_books=only_books)

    # ensure the target dir exists
    path(static_folder).mkdir_p()

    books = get_list_of_filtered_books(languages=languages,
                                       formats=formats,
                                       only_books=only_books)

    if not get_langs_with_count(books=books):
        critical_error("Unable to proceed. Combination of languages, "
                       "books and formats has no result.")

    # sz = len(list(books))
    # logger.debug("\tFiltered book collection size: {}".format(sz))

    def nb_by_fmt(fmt):
        # count books having at least one BookFormat row whose MIME
        # type matches the requested format
        return sum([
            1 for book in books
            if BookFormat.select(BookFormat, Book, Format).join(Book).switch(
                BookFormat).join(Format).where(Book.id == book.id).where(
                    Format.mime == FORMAT_MATRIX.get(fmt)).count()
        ])

    logger.debug("\tFiltered book collection, PDF: {}".format(
        nb_by_fmt('pdf')))
    logger.debug("\tFiltered book collection, ePUB: {}".format(
        nb_by_fmt('epub')))
    logger.debug("\tFiltered book collection, HTML: {}".format(
        nb_by_fmt('html')))

    # export to JSON helpers
    export_to_json_helpers(books=books,
                           static_folder=static_folder,
                           languages=languages,
                           formats=formats,
                           project_id=project_id)

    # export HTML index and other static files
    export_skeleton(static_folder=static_folder,
                    dev_mode=False,
                    languages=languages,
                    formats=formats,
                    only_books=only_books)

    # Compute popularity: split the download-sorted collection into
    # NB_POPULARITY_STARS buckets of roughly equal size and record the
    # download-count threshold at each bucket boundary
    popbooks = books.order_by(Book.downloads.desc())
    popbooks_count = popbooks.count()
    stars_limits = [0] * NB_POPULARITY_STARS
    stars = NB_POPULARITY_STARS
    nb_downloads = popbooks[0].downloads
    for ibook in range(popbooks_count):
        if (ibook > float(NB_POPULARITY_STARS - stars + 1)
                / NB_POPULARITY_STARS * popbooks_count
                and popbooks[ibook].downloads < nb_downloads):
            stars_limits[stars - 1] = nb_downloads
            stars = stars - 1
        nb_downloads = popbooks[ibook].downloads

    # export to HTML
    cached_files = os.listdir(download_cache)

    for book in books:
        # a book earns one star per threshold its download count reaches
        book.popularity = sum(
            int(book.downloads >= stars_limits[i])
            for i in range(NB_POPULARITY_STARS)
        )

    def dlb(b):
        return export_book_to(b,
                              static_folder=static_folder,
                              download_cache=download_cache,
                              cached_files=cached_files,
                              languages=languages,
                              formats=formats,
                              books=books,
                              project_id=project_id,
                              force=force)

    Pool(concurrency).map(dlb, books)
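
The star-rating pass above is easier to follow on plain data. Below is a rough standalone sketch of the same bucketing idea, using a hypothetical list of download counts in place of the peewee query; star_thresholds is an illustrative name, not part of the scraper:

NB_POPULARITY_STARS = 5  # assumed to match the constant used above


def star_thresholds(download_counts, nb_stars=NB_POPULARITY_STARS):
    # download_counts must be sorted in descending order; returns the
    # download-count threshold a book must reach for each star level
    limits = [0] * nb_stars
    stars = nb_stars
    total = len(download_counts)
    previous = download_counts[0]
    for i, downloads in enumerate(download_counts):
        boundary = float(nb_stars - stars + 1) / nb_stars * total
        if i > boundary and downloads < previous:
            limits[stars - 1] = previous
            stars -= 1
        previous = downloads
    return limits


counts = [900, 850, 400, 390, 200, 120, 80, 30, 10, 2]
limits = star_thresholds(counts)
# each book then earns one star per threshold it reaches
print([sum(c >= lim for lim in limits) for c in counts])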
Esempio n. 21
0
    def handle_companion_file(
        fname,
        dstfname=None,
        book=None,
        force=False,
        as_ext=None,
        html_file_list=None,
        s3_storage=None,
    ):
        ext = fname.suffix if as_ext is None else as_ext
        src = fname
        if dstfname is None:
            dstfname = fname.name
        dst = static_folder.joinpath(dstfname)
        if dst.exists() and not force:
            logger.debug("\t\tSkipping existing companion {}".format(dstfname))
            return

        # optimization based on mime/extension
        if ext in (".png", ".jpg", ".jpeg", ".gif"):
            logger.info("\t\tCopying and optimizing image companion {}".format(fname))
            optimize_image(src, dst)
            if dst.name == f"{book.id}_cover_image.jpg" and s3_storage:
                upload_to_cache(
                    asset=dst,
                    book_format="cover",
                    book_id=book.id,
                    etag=book.cover_etag,
                    s3_storage=s3_storage,
                    optimizer_version=optimizer_version,
                )
                update_download_cache(src, dst)
            elif html_file_list:
                html_file_list.append(dst)
                update_download_cache(src, dst)
        elif ext == ".epub":
            logger.info("\t\tCreating optimized EPUB file {}".format(fname))
            # reserve a unique temp path: closing the handle deletes the
            # placeholder file, leaving the name free for optimize_epub's output
            tmp_epub = tempfile.NamedTemporaryFile(suffix=".epub", dir=TMP_FOLDER)
            tmp_epub.close()
            try:
                optimize_epub(src, tmp_epub.name)
            except zipfile.BadZipFile:
                logger.warning(
                    "\t\tBad zip file. "
                    "Copying as it might be working: {}".format(fname)
                )
                handle_companion_file(fname, dstfname, book, force, as_ext=".zip")
            else:
                path(tmp_epub.name).move(dst)
                if s3_storage:
                    upload_to_cache(
                        asset=dst,
                        book_format="epub",
                        book_id=book.id,
                        etag=book.epub_etag,
                        s3_storage=s3_storage,
                        optimizer_version=optimizer_version,
                    )
                    update_download_cache(src, dst)
        else:
            # exclude thumbnail cache files created by Windows Explorer
            if src.name.endswith("_Thumbs.db"):
                return
            # copy anything else as-is (PDF mostly)
            logger.info("\t\tCopying companion file to {}".format(dst))
            copy_file(src, dst)
            if ext not in (".pdf", ".zip") and html_file_list:
                html_file_list.append(dst)
                update_download_cache(src, dst)
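
Note the pattern in the EPUB branch above: the optimizer writes to a throwaway temp path, and the result is only moved over the destination once optimization succeeds, so a failure never leaves a half-written file in the static folder. A minimal sketch of that pattern using only the standard library; transform stands in for a helper like optimize_epub:

import shutil
import tempfile
from pathlib import Path


def optimize_into_place(src, dst, transform):
    # reserve a unique temp path; closing the handle deletes the
    # placeholder file, leaving the name free for transform's output
    tmp = tempfile.NamedTemporaryFile(suffix=Path(dst).suffix)
    tmp_path = tmp.name
    tmp.close()
    transform(src, tmp_path)    # may raise: dst stays untouched
    shutil.move(tmp_path, dst)  # publish the result only on success


# e.g. optimize_into_place("raw.epub", "static/book.epub", optimize_epub)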