# Stdlib imports for this section; project-level helpers and models
# (logger, pp, FORMAT_MATRIX, BookFormat, Format, fname_for,
# archive_name_for, get_urls, handle_zipped_epub, download_from_cache,
# download_cover, ...) are assumed to be imported from the surrounding
# project.
import copy
import os
import pathlib
import shutil


def download_book(book, download_cache, languages, formats, force):
    logger.info(
        "\tDownloading content files for Book #{id}".format(id=book.id))

    # apply filters
    if not formats:
        formats = list(FORMAT_MATRIX.keys())

    # HTML is our base for the ZIM, so add it if not present
    if "html" not in formats:
        formats.append("html")

    for book_format in formats:
        fpath = os.path.join(download_cache, fname_for(book, book_format))

        # check if already downloaded
        if os.path.exists(fpath) and not force:
            logger.debug("\t\t{fmt} already exists at {path}".format(
                fmt=book_format, path=fpath))
            continue

        # retrieve corresponding BookFormat
        bfs = BookFormat.filter(book=book)

        if book_format == "html":
            patterns = [
                "mnsrb10h.htm", "8ledo10h.htm", "tycho10f.htm",
                "8ledo10h.zip", "salme10h.htm", "8nszr10h.htm",
                "{id}-h.html", "{id}.html.gen", "{id}-h.htm",
                "8regr10h.zip", "{id}.html.noimages", "8lgme10h.htm",
                "tycho10h.htm", "tycho10h.zip", "8lgme10h.zip",
                "8indn10h.zip", "8resp10h.zip", "20004-h.htm",
                "8indn10h.htm", "8memo10h.zip", "fondu10h.zip",
                "{id}-h.zip", "8mort10h.zip",
            ]
            bfso = bfs
            bfs = bfs.join(Format).filter(Format.pattern << patterns)
            if not bfs.count():
                pp([(b.format.mime, b.format.images, b.format.pattern)
                    for b in bfs])
                pp([(b.format.mime, b.format.images, b.format.pattern)
                    for b in bfso])
                logger.error("html not found")
                continue
        else:
            bfs = bfs.filter(BookFormat.format << Format.filter(
                mime=FORMAT_MATRIX.get(book_format)))

        if not bfs.count():
            logger.debug("[{}] not available for #{}# {}".format(
                book_format, book.id, book.title))
            continue

        # prefer the illustrated variant when several formats match
        if bfs.count() > 1:
            try:
                bf = bfs.join(Format).filter(Format.images).get()
            except Exception:
                bf = bfs.get()
        else:
            bf = bfs.get()

        logger.debug("[{}] Requesting URLs for #{}# {}".format(
            book_format, book.id, book.title))

        # retrieve list of URLs for format unless we have it in DB
        if bf.downloaded_from and not force:
            urls = [bf.downloaded_from]
        else:
            urld = get_urls(book)
            urls = list(reversed(urld.get(FORMAT_MATRIX.get(book_format))))

        allurls = copy.copy(urls)

        while urls:
            url = urls.pop()

            if len(allurls) != 1:
                if not resource_exists(url):
                    continue

            # HTML files are *sometimes* available as ZIP files
            if url.endswith(".zip"):
                zpath = "{}.zip".format(fpath)
                if not download_file(url, zpath):
                    logger.error("ZIP file download failed: {}".format(zpath))
                    continue
                # extract zipfile
                handle_zipped_epub(zippath=zpath, book=book,
                                   download_cache=download_cache)
            else:
                if not download_file(url, fpath):
                    logger.error("file download failed: {}".format(fpath))
                    continue

            # store working URL in DB
            bf.downloaded_from = url
            bf.save()

        if not bf.downloaded_from:
            logger.error("NO FILE FOR #{}/{}".format(book.id, book_format))
            pp(allurls)
            continue
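# The two network helpers used above (`resource_exists`, `download_file`)
# live elsewhere in the project. Below is a minimal, illustrative sketch of
# what they could look like, assuming the `requests` library; the real
# implementations may differ.
import requests


def resource_exists(url):
    """Sketch: HEAD the URL and treat any 2xx status as existing."""
    try:
        resp = requests.head(url, allow_redirects=True, timeout=30)
        return 200 <= resp.status_code < 300
    except requests.RequestException:
        return False


def download_file(url, fpath):
    """Sketch: stream the response to `fpath`; return True on success."""
    try:
        resp = requests.get(url, stream=True, timeout=30)
        resp.raise_for_status()
        with open(str(fpath), "wb") as fh:
            for chunk in resp.iter_content(chunk_size=1024 * 1024):
                fh.write(chunk)
        return True
    except requests.RequestException:
        return False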
def download_book(book, download_cache, languages, formats, force,
                  s3_storage, optimizer_version):
    logger.info(
        "\tDownloading content files for Book #{id}".format(id=book.id))

    # apply filters
    if not formats:
        formats = list(FORMAT_MATRIX.keys())

    # HTML is our base for the ZIM, so add it if not present
    if "html" not in formats:
        formats.append("html")

    book_dir = pathlib.Path(download_cache).joinpath(str(book.id))
    optimized_dir = book_dir.joinpath("optimized")
    unoptimized_dir = book_dir.joinpath("unoptimized")
    unsuccessful_formats = []

    for book_format in formats:
        unoptimized_fpath = unoptimized_dir.joinpath(
            fname_for(book, book_format))
        optimized_fpath = optimized_dir.joinpath(
            archive_name_for(book, book_format))

        # check if already downloaded
        if (unoptimized_fpath.exists()
                or optimized_fpath.exists()) and not force:
            logger.debug(
                f"\t\t{book_format} already exists for book #{book.id}")
            continue

        if force:
            if book_format == "html":
                for fpath in book_dir.iterdir():
                    if fpath.is_file() and fpath.suffix not in [
                        ".pdf", ".epub",
                    ]:
                        fpath.unlink()
            else:
                if unoptimized_fpath.exists():
                    unoptimized_fpath.unlink()
                if optimized_fpath.exists():
                    optimized_fpath.unlink()
            # delete dirs which are empty
            for dir_name in [optimized_dir, unoptimized_dir]:
                if not dir_name.exists():
                    continue
                if not list(dir_name.iterdir()):
                    dir_name.rmdir()

        # retrieve corresponding BookFormat
        bfs = BookFormat.filter(book=book)

        if book_format == "html":
            patterns = [
                "mnsrb10h.htm", "8ledo10h.htm", "tycho10f.htm",
                "8ledo10h.zip", "salme10h.htm", "8nszr10h.htm",
                "{id}-h.html", "{id}.html.gen", "{id}-h.htm",
                "8regr10h.zip", "{id}.html.noimages", "8lgme10h.htm",
                "tycho10h.htm", "tycho10h.zip", "8lgme10h.zip",
                "8indn10h.zip", "8resp10h.zip", "20004-h.htm",
                "8indn10h.htm", "8memo10h.zip", "fondu10h.zip",
                "{id}-h.zip", "8mort10h.zip",
            ]
            bfso = bfs
            bfs = bfs.join(Format).filter(Format.pattern << patterns)
            if not bfs.count():
                pp([(b.format.mime, b.format.images, b.format.pattern)
                    for b in bfs])
                pp([(b.format.mime, b.format.images, b.format.pattern)
                    for b in bfso])
                logger.error("html not found")
                unsuccessful_formats.append(book_format)
                continue
        else:
            bfs = bfs.filter(BookFormat.format << Format.filter(
                mime=FORMAT_MATRIX.get(book_format)))

        if not bfs.count():
            logger.debug("[{}] not available for #{}# {}".format(
                book_format, book.id, book.title))
            unsuccessful_formats.append(book_format)
            continue

        # prefer the illustrated variant when several formats match
        if bfs.count() > 1:
            try:
                bf = bfs.join(Format).filter(Format.images).get()
            except Exception:
                bf = bfs.get()
        else:
            bf = bfs.get()

        logger.debug("[{}] Requesting URLs for #{}# {}".format(
            book_format, book.id, book.title))

        # retrieve list of URLs for format unless we have it in DB
        if bf.downloaded_from and not force:
            urls = [bf.downloaded_from]
        else:
            urld = get_urls(book)
            urls = list(reversed(urld.get(FORMAT_MATRIX.get(book_format))))

        allurls = copy.copy(urls)
        downloaded_from_cache = False

        while urls:
            url = urls.pop()

            # for development
            # if len(allurls) != 1:
            #     if not resource_exists(url):
            #         continue

            # HTML files are *sometimes* available as ZIP files
            if url.endswith(".zip"):
                zpath = unoptimized_dir.joinpath(
                    f"{fname_for(book, book_format)}.zip")
                etag = get_etag_from_url(url)
                if s3_storage:
                    if download_from_cache(
                        book=book,
                        etag=etag,
                        book_format=book_format,
                        dest_dir=optimized_dir,
                        s3_storage=s3_storage,
                        optimizer_version=optimizer_version,
                    ):
                        downloaded_from_cache = True
                        break
                if not download_file(url, zpath):
                    logger.error("ZIP file download failed: {}".format(zpath))
                    continue
                # save etag
                book.html_etag = etag
                book.save()
                # extract zipfile
                handle_zipped_epub(zippath=zpath, book=book,
                                   dst_dir=unoptimized_dir)
            else:
                if (url.endswith(".htm") or url.endswith(".html")
                        or url.endswith(".html.utf8")
                        or url.endswith(".epub")):
                    etag = get_etag_from_url(url)
                    if s3_storage:
                        logger.info(
                            f"Trying to download {book.id} "
                            f"from optimization cache")
                        if download_from_cache(
                            book=book,
                            etag=etag,
                            book_format=book_format,
                            dest_dir=optimized_dir,
                            s3_storage=s3_storage,
                            optimizer_version=optimizer_version,
                        ):
                            downloaded_from_cache = True
                            break
                if not download_file(url, unoptimized_fpath):
                    logger.error(
                        "file download failed: {}".format(unoptimized_fpath))
                    continue
                # save etag if html or epub download was successful
                if (url.endswith(".htm") or url.endswith(".html")
                        or url.endswith(".html.utf8")):
                    logger.debug(f"Saving html ETag for {book.id}")
                    book.html_etag = etag
                    book.save()
                elif url.endswith(".epub"):
                    logger.debug(f"Saving epub ETag for {book.id}")
                    book.epub_etag = etag
                    book.save()

            # store working URL in DB
            bf.downloaded_from = url
            bf.save()

            # break as we got a working URL
            break

        if not bf.downloaded_from and not downloaded_from_cache:
            logger.error("NO FILE FOR #{}/{}".format(book.id, book_format))
            # delete instance from DB if download failed
            logger.info("Deleting instance from DB")
            bf.delete_instance()
            unsuccessful_formats.append(book_format)
            pp(allurls)

    # delete book from DB if not downloaded in any format
    if len(unsuccessful_formats) == len(formats):
        logger.debug(
            f"Book #{book.id} could not be downloaded in any format. "
            f"Deleting from DB ...")
        book.delete_instance()
        if book_dir.exists():
            shutil.rmtree(book_dir, ignore_errors=True)
        return

    download_cover(book, book_dir, s3_storage, optimizer_version)