Example #1
0
def setup_database(wipe=False):
    """Create the missing tables and load their fixtures.

    wipe: when true, every table is dropped first (missing tables are
        ignored), so it is then re-created below.
    """
    logger.info("Setting up the database")

    tables = (License, Format, Author, Book, BookFormat)
    for table in tables:
        if wipe:
            table.drop_table(fail_silently=True)

        # nothing to create: leave the existing table untouched
        if table.table_exists():
            logger.debug("{} table already exists.".format(table._meta.name))
            continue

        table.create_table()
        logger.debug("Created table for {}".format(table._meta.name))
        # fixtures are only loaded into freshly created tables
        load_fixtures(table)
Example #2
0
    def handle_companion_file(fname, dstfname=None, book=None):
        """Bring one cached companion file into the static folder.

        fname: file name inside the download cache.
        dstfname: name to use in the static folder (defaults to `fname`).
        book: not used by this function.

        `download_cache` and `static_folder` come from the enclosing scope.
        """
        image_exts = ('.png', '.jpg', '.jpeg', '.gif')

        src = os.path.join(path(download_cache).abspath(), fname)
        target_name = fname if dstfname is None else dstfname
        dst = os.path.join(path(static_folder).abspath(), target_name)
        extension = path(fname).ext

        # the treatment is picked from the file extension
        if extension in image_exts:
            copy_from_cache(src, dst)
            optimize_image(dst)
            return

        if extension != '.epub':
            # no dedicated treatment (PDF mostly): plain copy
            logger.debug("\t\tshitty ext: {}".format(dst))
            copy_from_cache(src, dst)
            return

        # only the unique name is wanted: closing the temporary file
        # removes it, leaving the name free for optimize_epub
        placeholder = tempfile.NamedTemporaryFile(suffix='.epub')
        tmp_name = placeholder.name
        placeholder.close()
        optimize_epub(src, tmp_name)
        path(tmp_name).move(dst)
Example #3
0
    def handle_companion_file(fname, dstfname=None, book=None):
        """Bring one cached companion file into the static folder.

        fname: file name inside the download cache.
        dstfname: name to use in the static folder (defaults to `fname`).
        book: not used by this function.

        `download_cache` and `static_folder` come from the enclosing scope.
        """
        src = os.path.join(path(download_cache).abspath(), fname)
        target_name = fname if dstfname is None else dstfname
        dst = os.path.join(path(static_folder).abspath(), target_name)
        extension = path(fname).ext

        # the treatment is picked from the file extension
        if extension in ('.png', '.jpg', '.jpeg', '.gif'):
            copy_from_cache(src, dst)
            optimize_image(path_for_cmd(dst))
        elif extension == '.epub':
            # only the unique name is wanted: closing the temporary file
            # removes it, leaving the name free for optimize_epub
            placeholder = tempfile.NamedTemporaryFile(suffix='.epub',
                                                      dir=TMP_FOLDER)
            tmp_name = placeholder.name
            placeholder.close()
            optimize_epub(src, tmp_name)
            path(tmp_name).move(dst)
        elif not src.endswith('_Thumbs.db'):
            # files created by Windows Explorer are skipped; anything else
            # (PDF mostly) is copied as is
            logger.debug("\t\tshitty ext: {}".format(dst))
            copy_from_cache(src, dst)
Example #4
0
    def handle_companion_file(fname, dstfname=None, book=None):
        """Bring one cached companion file into the static folder.

        fname: file name inside the download cache.
        dstfname: name to use in the static folder (defaults to `fname`).
        book: not used by this function.

        `download_cache` and `static_folder` come from the enclosing scope.
        """
        image_exts = ('.png', '.jpg', '.jpeg', '.gif')

        cache_dir = path(download_cache).abspath()
        src = os.path.join(cache_dir, fname)
        if dstfname is None:
            dstfname = fname
        static_dir = path(static_folder).abspath()
        dst = os.path.join(static_dir, dstfname)
        extension = path(fname).ext

        # images: copy then optimize in place
        if extension in image_exts:
            copy_from_cache(src, dst)
            optimize_image(path_for_cmd(dst))
            return

        # ePUB: optimize into a temporary name, then move to destination
        if extension == '.epub':
            # closing the temporary file removes it; only its unique name
            # is reused
            holder = tempfile.NamedTemporaryFile(suffix='.epub',
                                                 dir=TMP_FOLDER)
            holder.close()
            optimize_epub(src, holder.name)
            path(holder.name).move(dst)
            return

        # excludes files created by Windows Explorer
        if src.endswith('_Thumbs.db'):
            return

        # copy otherwise (PDF mostly)
        logger.debug("\t\tshitty ext: {}".format(dst))
        copy_from_cache(src, dst)
Example #5
0
def download_all_books(url_mirror, download_cache,
                       languages=[], formats=[],
                       only_books=[], force=False):
    """Download the content files of the filtered books into the cache.

    For every book returned by `get_list_of_filtered_books` and every
    requested format, the matching `BookFormat` row is looked up, its
    candidate URLs are tried, and each URL that downloads successfully is
    recorded in `BookFormat.downloaded_from`.

    url_mirror: not used by this function.
    download_cache: folder receiving the downloaded files (created if
        missing).
    languages, formats, only_books: filters forwarded to
        `get_list_of_filtered_books`. An empty `formats` means every key
        of `FORMAT_MATRIX`; 'html' is appended (in place) to a non-empty
        `formats` list that lacks it.
    force: when true, download again even if the file is already in the
        cache or a working URL is already recorded.
    """
    from pprint import pprint as pp

    # Format patterns accepted as the HTML version of a book
    html_patterns = ['mnsrb10h.htm', '8ledo10h.htm', 'tycho10f.htm',
                     '8ledo10h.zip', 'salme10h.htm', '8nszr10h.htm',
                     '{id}-h.html', '{id}.html.gen', '{id}-h.htm',
                     '8regr10h.zip', '{id}.html.noimages',
                     '8lgme10h.htm', 'tycho10h.htm', 'tycho10h.zip',
                     '8lgme10h.zip', '8indn10h.zip', '8resp10h.zip',
                     '20004-h.htm', '8indn10h.htm', '8memo10h.zip',
                     'fondu10h.zip', '{id}-h.zip', '8mort10h.zip']

    available_books = get_list_of_filtered_books(
        languages=languages,
        formats=formats,
        only_books=only_books)

    # ensure dir exist
    path(download_cache).mkdir_p()

    for book in available_books:

        logger.info("\tDownloading content files for Book #{id}"
                    .format(id=book.id))

        # apply filters (a real list, so that 'html' can be appended)
        if not formats:
            formats = list(FORMAT_MATRIX.keys())

        # HTML is our base for ZIM so add it if not present
        if 'html' not in formats:
            formats.append('html')

        for fmt in formats:

            fpath = os.path.join(download_cache, fname_for(book, fmt))

            # check if already downloaded
            if path(fpath).exists() and not force:
                logger.debug("\t\t{fmt} already exists at {path}"
                             .format(fmt=fmt, path=fpath))
                continue

            # retrieve corresponding BookFormat
            bfs = BookFormat.filter(book=book)

            if fmt == 'html':
                all_bfs = bfs
                bfs = bfs.join(Format).filter(Format.pattern << html_patterns)
                if not bfs.count():
                    # debugging aid: dump the matching formats, then all
                    # the formats known for this book
                    pp([(b.format.mime, b.format.images, b.format.pattern)
                        for b in bfs])
                    pp([(b.format.mime, b.format.images, b.format.pattern)
                        for b in all_bfs])
                    logger.error("html not found")
                    continue
            else:
                bfs = bfs.filter(BookFormat.format << Format.filter(
                    mime=FORMAT_MATRIX.get(fmt)))

            if not bfs.count():
                logger.debug("[{}] not avail. for #{}# {}"
                             .format(fmt, book.id, book.title))
                continue

            if bfs.count() > 1:
                # prefer the variant flagged as having images; `== True`
                # is a peewee expression and must not become `is True`
                try:
                    bf = bfs.join(Format).filter(Format.images == True).get()
                except Exception:
                    # any query failure falls back to the first candidate
                    # without swallowing KeyboardInterrupt/SystemExit
                    bf = bfs.get()
            else:
                bf = bfs.get()

            logger.debug("[{}] Requesting URLs for #{}# {}"
                         .format(fmt, book.id, book.title))

            # retrieve list of URLs for format unless we have it in DB
            if bf.downloaded_from and not force:
                urls = [bf.downloaded_from]
            else:
                urld = get_urls(book)
                # no entry for this mime type means no candidate URL
                candidates = urld.get(FORMAT_MATRIX.get(fmt)) or []
                # reversed so that pop() yields the URLs in original order
                urls = list(reversed(candidates))

            # kept for the error report below (urls is consumed by pop())
            allurls = list(urls)

            # NOTE(review): the loop keeps trying the remaining URLs after
            # a successful download -- confirm whether a break is wanted.
            while urls:
                url = urls.pop()

                if not resource_exists(url):
                    continue

                # HTML files are *sometime* available as ZIP files
                if url.endswith('.zip'):
                    zpath = "{}.zip".format(fpath)

                    if not download_file(url, zpath):
                        logger.error("ZIP file donwload failed: {}".format(zpath))
                        continue

                    # extract zipfile
                    handle_zipped_epub(zippath=zpath, book=book,
                                       download_cache=download_cache)
                else:
                    if not download_file(url, fpath):
                        logger.error("file donwload failed: {}".format(fpath))
                        continue

                # store working URL in DB
                bf.downloaded_from = url
                bf.save()

            if not bf.downloaded_from:
                logger.error("NO FILE FOR #{}/{}".format(book.id, fmt))
                pp(allurls)
                continue
Example #6
0
def export_all_books(static_folder,
                     download_cache,
                     languages=[],
                     formats=[],
                     only_books=[]):
    """Export the filtered books and the shared assets to `static_folder`.

    Steps, in order: log collection statistics, write the JSON helpers,
    copy the template assets, render `Home.html`, compute the download
    thresholds used for the popularity stars, then export every book.

    static_folder: destination folder (created if missing).
    download_cache: folder whose listing is handed to `export_book_to`.
    languages, formats, only_books: filters forwarded to
        `get_list_of_filtered_books`.
    """
    # ensure dir exist
    path(static_folder).mkdir_p()

    books = get_list_of_filtered_books(languages=languages,
                                       formats=formats,
                                       only_books=only_books)

    sz = len(list(books))
    logger.debug("\tFiltered book collection size: {}".format(sz))

    def nb_by_fmt(fmt):
        # number of filtered books having a BookFormat of that mime type
        mime = FORMAT_MATRIX.get(fmt)
        return sum(
            1 for book in books
            if BookFormat.select(BookFormat, Book, Format).join(Book).switch(
                BookFormat).join(Format).where(Book.id == book.id).where(
                    Format.mime == mime).count())

    logger.debug("\tFiltered book collection, PDF: {}".format(
        nb_by_fmt('pdf')))
    logger.debug("\tFiltered book collection, ePUB: {}".format(
        nb_by_fmt('epub')))
    logger.debug("\tFiltered book collection, HTML: {}".format(
        nb_by_fmt('html')))

    # export to JSON helpers
    export_to_json_helpers(books=books,
                           static_folder=static_folder,
                           languages=languages,
                           formats=formats)

    # copy CSS/JS/* to static_folder
    src_folder = tmpl_path()
    for fname in ('css', 'js', 'jquery', 'favicon.ico', 'favicon.png',
                  'jquery-ui', 'datatables', 'fonts', 'l10n'):
        src = os.path.join(src_folder, fname)
        dst = os.path.join(static_folder, fname)
        # entries without an extension are folders, replaced as a whole
        if not path(fname).ext:
            path(dst).rmtree_p()
            path(src).copytree(dst)
        else:
            path(src).copyfile(dst)

    # export homepage
    template = jinja_env.get_template('index.html')
    context = get_default_context(books=books)
    context.update({'show_books': True})
    with open(os.path.join(static_folder, 'Home.html'), 'w') as f:
        f.write(template.render(**context).encode('utf-8'))

    # Compute popularity: stars_limits[i] is the number of downloads a
    # book needs to earn star i + 1 (0 means the star is always granted)
    popbooks = books.order_by(Book.downloads.desc())
    nb_popbooks = popbooks.count()  # queried once instead of every turn
    stars_limits = [0] * NB_POPULARITY_STARS
    # an empty collection has no first book to start from
    if nb_popbooks:
        stars = NB_POPULARITY_STARS
        nb_downloads = popbooks[0].downloads
        for ibook in range(nb_popbooks):
            downloads = popbooks[ibook].downloads
            # rank (most downloaded first) after which a star is lost
            rank_limit = (float(NB_POPULARITY_STARS - stars + 1)
                          / NB_POPULARITY_STARS * nb_popbooks)
            # only step down where the download count actually drops
            if ibook > rank_limit and downloads < nb_downloads:
                stars_limits[stars - 1] = nb_downloads
                stars = stars - 1
            nb_downloads = downloads

    # export to HTML
    cached_files = os.listdir(download_cache)
    for book in books:
        book.popularity = sum(
            int(book.downloads >= limit) for limit in stars_limits)
        export_book_to(book=book,
                       static_folder=static_folder,
                       download_cache=download_cache,
                       cached_files=cached_files,
                       languages=languages,
                       formats=formats,
                       books=books)
Example #7
0
def build_zimfile(static_folder, zim_path=None,
                  languages=[], formats=[],
                  title=None, description=None,
                  only_books=[]):
    """Build the `zimwriterfs` command line and run it through `exec_cmd`.

    static_folder: folder handed to zimwriterfs as its source.
    zim_path: output file; a dated default name is used when None.
    languages: language codes; ['mul'] is used when empty.
    formats: format names, only used to build the default title.
    title, description: ZIM metadata; defaults are built when None.
    only_books: not used by this function.
    """
    # work on sorted copies so that the caller's lists (and the shared
    # default lists) are never modified in place
    languages = sorted(languages) if languages else ['mul']
    formats = sorted(formats)

    if title is None:
        if len(languages) > 5:
            title = ("Project Gutenberg Library with {formats}"
                     .format(formats=",".join(formats)))
        else:
            title = ("Project Gutenberg Library ({langs}) with {formats}"
                     .format(langs=",".join(languages),
                             formats=",".join(formats)))

    logger.info("\tWritting ZIM for {}".format(title))

    if description is None:
        description = "The first producer of free ebooks"

    if zim_path is None:
        # NOTE(review): the two default names use different date formats
        # ('%m_%Y' vs '%Y-%m') -- confirm whether that is intended.
        if len(languages) > 1:
            zim_path = "gutenberg_all_{date}.zim".format(
                    date=datetime.datetime.now().strftime('%m_%Y'))
        else:
            zim_path = "gutenberg_{lang}_all_{date}.zim".format(
                    lang=languages[0],
                    date=datetime.datetime.now().strftime('%Y-%m'))

    # translate the codes through ISO_MATRIX (unknown codes kept as is)
    languages = sorted(ISO_MATRIX.get(lang, lang) for lang in languages)

    context = {
        'languages': ','.join(languages),
        'title': title,
        'description': description,
        'creator': 'gutenberg.org',
        'publisher': 'Kiwix',

        'home': 'Home.html',
        'favicon': 'favicon.png',

        'static': static_folder,
        'zim': zim_path
    }

    # every value is wrapped in backslash-escaped double quotes
    cmd = ('zimwriterfs --welcome=\\"{home}\\" --favicon=\\"{favicon}\\" '
           '--language=\\"{languages}\\" --title=\\"{title}\\" '
           '--description=\\"{description}\\" '
           '--creator=\\"{creator}\\" --publisher=\\"{publisher}\\" \\"{static}\\" \\"{zim}\\"'
           .format(**context))

    # log a readable version of the command, without the backslashes
    logger.debug("\t\t{}".format(cmd.replace('\\"', '"')))
    if exec_cmd(cmd):
        logger.info("Successfuly created ZIM file at {}".format(zim_path))
    else:
        logger.error("Unable to create ZIM file :(")
Example #8
0
def download_all_books(url_mirror,
                       download_cache,
                       languages=[],
                       formats=[],
                       only_books=[],
                       force=False):
    """Download the content files of the filtered books into the cache.

    For every book returned by `get_list_of_filtered_books` and every
    requested format, the matching `BookFormat` row is looked up, its
    candidate URLs are tried, and each URL that downloads successfully is
    recorded in `BookFormat.downloaded_from`.

    url_mirror: not used by this function.
    download_cache: folder receiving the downloaded files (created if
        missing).
    languages, formats, only_books: filters forwarded to
        `get_list_of_filtered_books`. An empty `formats` means every key
        of `FORMAT_MATRIX`; 'html' is appended (in place) to a non-empty
        `formats` list that lacks it.
    force: when true, download again even if the file is already in the
        cache or a working URL is already recorded.
    """
    from pprint import pprint as pp

    # Format patterns accepted as the HTML version of a book
    html_patterns = [
        'mnsrb10h.htm', '8ledo10h.htm', 'tycho10f.htm',
        '8ledo10h.zip', 'salme10h.htm', '8nszr10h.htm',
        '{id}-h.html', '{id}.html.gen', '{id}-h.htm',
        '8regr10h.zip', '{id}.html.noimages', '8lgme10h.htm',
        'tycho10h.htm', 'tycho10h.zip', '8lgme10h.zip',
        '8indn10h.zip', '8resp10h.zip', '20004-h.htm',
        '8indn10h.htm', '8memo10h.zip', 'fondu10h.zip',
        '{id}-h.zip', '8mort10h.zip'
    ]

    available_books = get_list_of_filtered_books(languages=languages,
                                                 formats=formats,
                                                 only_books=only_books)

    # ensure dir exist
    path(download_cache).mkdir_p()

    for book in available_books:

        logger.info(
            "\tDownloading content files for Book #{id}".format(id=book.id))

        # apply filters (a real list, so that 'html' can be appended)
        if not formats:
            formats = list(FORMAT_MATRIX.keys())

        # HTML is our base for ZIM so add it if not present
        if 'html' not in formats:
            formats.append('html')

        for fmt in formats:

            fpath = os.path.join(download_cache, fname_for(book, fmt))

            # check if already downloaded
            if path(fpath).exists() and not force:
                logger.debug("\t\t{fmt} already exists at {path}".format(
                    fmt=fmt, path=fpath))
                continue

            # retrieve corresponding BookFormat
            bfs = BookFormat.filter(book=book)

            if fmt == 'html':
                all_bfs = bfs
                bfs = bfs.join(Format).filter(Format.pattern << html_patterns)
                if not bfs.count():
                    # debugging aid: dump the matching formats, then all
                    # the formats known for this book
                    pp([(b.format.mime, b.format.images, b.format.pattern)
                        for b in bfs])
                    pp([(b.format.mime, b.format.images, b.format.pattern)
                        for b in all_bfs])
                    logger.error("html not found")
                    continue
            else:
                bfs = bfs.filter(BookFormat.format << Format.filter(
                    mime=FORMAT_MATRIX.get(fmt)))

            if not bfs.count():
                logger.debug("[{}] not avail. for #{}# {}".format(
                    fmt, book.id, book.title))
                continue

            if bfs.count() > 1:
                # prefer the variant flagged as having images; `== True`
                # is a peewee expression and must not become `is True`
                try:
                    bf = bfs.join(Format).filter(Format.images == True).get()
                except Exception:
                    # any query failure falls back to the first candidate
                    # without swallowing KeyboardInterrupt/SystemExit
                    bf = bfs.get()
            else:
                bf = bfs.get()

            logger.debug("[{}] Requesting URLs for #{}# {}".format(
                fmt, book.id, book.title))

            # retrieve list of URLs for format unless we have it in DB
            if bf.downloaded_from and not force:
                urls = [bf.downloaded_from]
            else:
                urld = get_urls(book)
                # no entry for this mime type means no candidate URL
                candidates = urld.get(FORMAT_MATRIX.get(fmt)) or []
                # reversed so that pop() yields the URLs in original order
                urls = list(reversed(candidates))

            # kept for the error report below (urls is consumed by pop())
            allurls = list(urls)

            # NOTE(review): the loop keeps trying the remaining URLs after
            # a successful download -- confirm whether a break is wanted.
            while urls:
                url = urls.pop()

                if not resource_exists(url):
                    continue

                # HTML files are *sometime* available as ZIP files
                if url.endswith('.zip'):
                    zpath = "{}.zip".format(fpath)

                    if not download_file(url, zpath):
                        logger.error(
                            "ZIP file donwload failed: {}".format(zpath))
                        continue

                    # extract zipfile
                    handle_zipped_epub(zippath=zpath,
                                       book=book,
                                       download_cache=download_cache)
                else:
                    if not download_file(url, fpath):
                        logger.error("file donwload failed: {}".format(fpath))
                        continue

                # store working URL in DB
                bf.downloaded_from = url
                bf.save()

            if not bf.downloaded_from:
                logger.error("NO FILE FOR #{}/{}".format(book.id, fmt))
                pp(allurls)
                continue
Example #9
0
def load_fixtures(model):
    """Create one row of `model` per entry of its `_meta.fixtures`.

    model: class exposing `_meta.name`, `create(**fields)` and,
        optionally, `_meta.fixtures` (an iterable of keyword mappings).
        Nothing is created when `fixtures` is not declared.
    """
    meta = model._meta
    logger.info("Loading fixtures for {}".format(meta.name))

    fixtures = getattr(meta, 'fixtures', [])
    for fields in fixtures:
        created = model.create(**fields)
        logger.debug("[fixtures] Created {}".format(created))
Example #10
0
def export_all_books(static_folder,
                     download_cache,
                     languages=[],
                     formats=[],
                     only_books=[]):
    """Export the filtered books and the shared assets to `static_folder`.

    Steps, in order: log collection statistics, write the JSON helpers,
    copy the template assets, render `Home.html`, compute the download
    thresholds used for the popularity stars, then export every book.

    static_folder: destination folder (created if missing).
    download_cache: folder whose listing is handed to `export_book_to`.
    languages, formats, only_books: filters forwarded to
        `get_list_of_filtered_books`.
    """
    # ensure dir exist
    path(static_folder).mkdir_p()

    books = get_list_of_filtered_books(languages=languages,
                                       formats=formats,
                                       only_books=only_books)

    sz = len(list(books))
    logger.debug("\tFiltered book collection size: {}".format(sz))

    def nb_by_fmt(fmt):
        # number of filtered books having a BookFormat of that mime type
        mime = FORMAT_MATRIX.get(fmt)
        return sum(1 for book in books
                   if BookFormat.select(BookFormat, Book, Format)
                                .join(Book).switch(BookFormat)
                                .join(Format)
                                .where(Book.id == book.id)
                                .where(Format.mime == mime)
                                .count())

    logger.debug("\tFiltered book collection, PDF: {}"
                 .format(nb_by_fmt('pdf')))
    logger.debug("\tFiltered book collection, ePUB: {}"
                 .format(nb_by_fmt('epub')))
    logger.debug("\tFiltered book collection, HTML: {}"
                 .format(nb_by_fmt('html')))

    # export to JSON helpers
    export_to_json_helpers(books=books,
                           static_folder=static_folder,
                           languages=languages,
                           formats=formats)

    # copy CSS/JS/* to static_folder
    src_folder = tmpl_path()
    for fname in ('css', 'js', 'jquery', 'favicon.ico', 'favicon.png',
                  'jquery-ui', 'datatables', 'fonts', 'l10n'):
        src = os.path.join(src_folder, fname)
        dst = os.path.join(static_folder, fname)
        # entries without an extension are folders, replaced as a whole
        if not path(fname).ext:
            path(dst).rmtree_p()
            path(src).copytree(dst)
        else:
            path(src).copyfile(dst)

    # export homepage
    template = jinja_env.get_template('index.html')
    context = get_default_context(books=books)
    context.update({'show_books': True})
    with open(os.path.join(static_folder, 'Home.html'), 'w') as f:
        f.write(template.render(**context).encode('utf-8'))

    # Compute popularity: stars_limits[i] is the number of downloads a
    # book needs to earn star i + 1 (0 means the star is always granted)
    popbooks = books.order_by(Book.downloads.desc())
    nb_popbooks = popbooks.count()  # queried once instead of every turn
    stars_limits = [0] * NB_POPULARITY_STARS
    # an empty collection has no first book to start from
    if nb_popbooks:
        stars = NB_POPULARITY_STARS
        nb_downloads = popbooks[0].downloads
        for ibook in range(nb_popbooks):
            downloads = popbooks[ibook].downloads
            # rank (most downloaded first) after which a star is lost
            rank_limit = (float(NB_POPULARITY_STARS - stars + 1)
                          / NB_POPULARITY_STARS * nb_popbooks)
            # only step down where the download count actually drops
            if ibook > rank_limit and downloads < nb_downloads:
                stars_limits[stars - 1] = nb_downloads
                stars = stars - 1
            nb_downloads = downloads

    # export to HTML
    cached_files = os.listdir(download_cache)
    for book in books:
        book.popularity = sum(
            int(book.downloads >= limit) for limit in stars_limits)
        export_book_to(book=book,
                       static_folder=static_folder,
                       download_cache=download_cache,
                       cached_files=cached_files,
                       languages=languages,
                       formats=formats,
                       books=books)