Example #1
def download_rdf_file(rdf_url):
    fname = 'rdf-files.tar.bz2'

    if path(fname).exists():
        logger.info("\tdf-files.tar.bz2 already exists in {}".format(fname))
        return fname

    logger.info("\tDownloading {} into {}".format(rdf_url, fname))
    download_file(rdf_url, fname)

    return fname
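
The download_file helper isn't defined in these snippets. A minimal sketch of what it might look like, assuming the requests library is available (an illustration, not the project's actual implementation):

import requests

def download_file(url, fpath):
    # hypothetical stand-in for the helper used above: stream the URL
    # to disk and report success as a boolean
    try:
        resp = requests.get(url, stream=True, timeout=60)
        resp.raise_for_status()
        with open(fpath, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=2 ** 16):
                f.write(chunk)
        return True
    except requests.RequestException:
        return False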
Example #2
def setup_database(wipe=False):
    logger.info("Setting up the database")

    for model in (License, Format, Author, Book, BookFormat):
        if wipe:
            model.drop_table(fail_silently=True)
        if not model.table_exists():
            model.create_table()
            logger.debug("Created table for {}".format(model._meta.name))
            load_fixtures(model)
        else:
            logger.debug("{} table already exists.".format(model._meta.name))
Example #3
def copy_from_cache(fname, dstfname=None):
    # relies on download_cache and static_folder from the enclosing scope
    src = os.path.join(path(download_cache).abspath(), fname)
    if dstfname is None:
        dstfname = fname
    dst = os.path.join(path(static_folder).abspath(), dstfname)
    logger.info("\t\tCopying {}".format(dst))
    path(dst).unlink_p()
    try:
        path(src).copy(dst)
    except IOError:
        logger.error("/!\\ Unable to copy missing file {}".format(src))
        return
Example #4
def extract_rdf_files(rdf_tarball, rdf_path):
    if path(rdf_path).exists():
        logger.info("\tRDF-files folder already exists in {}".format(rdf_path))
        return

    logger.info("\tExtracting {} into {}".format(rdf_tarball, rdf_path))

    # create the destination dir if it doesn't exist
    dest = path(rdf_path)
    dest.mkdir_p()

    cmd = "tar -C {dest} --strip-components 2 -x -f {tarb}".format(
        dest=rdf_path, tarb=rdf_tarball)
    exec_cmd(cmd)
    return
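
Shelling out to tar keeps the example short, but the same extraction can be done with the standard tarfile module. A rough pure-Python equivalent of the command above (a sketch, not the project's code), emulating --strip-components 2 by rewriting member names:

import os
import tarfile

def extract_rdf_files_py(rdf_tarball, rdf_path):
    with tarfile.open(rdf_tarball, 'r:bz2') as tar:
        for member in tar.getmembers():
            # drop the two leading path components, like --strip-components 2
            parts = member.name.split('/')[2:]
            if not parts:
                continue
            member.name = os.path.join(*parts)
            tar.extract(member, rdf_path)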
Example #5
def parse_and_fill(rdf_path, only_books=[]):
    logger.info("\tLooping throught RDF files in {}".format(rdf_path))

    for root, dirs, files in os.walk(rdf_path):
        if root.endswith('999999'):
            continue

        # skip books outside of requested list
        if only_books and path(root).basename() not in \
                [str(bid) for bid in only_books]:
            continue

        for fname in files:
            if fname in ('.', '..', 'pg0.rdf'):
                continue

            if not fname.endswith('.rdf'):
                continue

            fpath = os.path.join(root, fname)
            parse_and_process_file(fpath)
Example #6
def parse_and_process_file(rdf_file):
    logger.info("\tParsing file {}".format(rdf_file))
    if not path(rdf_file).exists():
        raise ValueError(rdf_file)

    gid = re.match(r'.*/pg([0-9]+)\.rdf', rdf_file).group(1)

    with open(rdf_file, 'r') as f:
        parser = RdfParser(f.read(), gid).parse()

    if parser.license == 'None':
        logger.info("\tWARN: Unusable book without any information {}".format(gid))
    elif parser.title == '':
        logger.info("\tWARN: Unusable book without title {}".format(gid))
    else:
        save_rdf_in_database(parser)
Example #7
def optimize_epub(src, dst):
    # extracted from export_book_to (see the next example); relies on
    # book and TMP_FOLDER from the enclosing scope
    logger.info("\t\tCreating ePUB at {}".format(dst))
    zipped_files = []
    # create temp directory to extract to
    tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)
    with zipfile.ZipFile(src, 'r') as zf:
        zipped_files = zf.namelist()
        zf.extractall(tmpd)

    remove_cover = False
    # iterate over a copy since entries may be removed along the way
    for fname in list(zipped_files):
        fnp = os.path.join(tmpd, fname)
        if path(fname).ext in ('.png', '.jpeg', '.jpg', '.gif'):
            # special case to remove ugly cover
            if fname.endswith('cover.jpg') and is_bad_cover(fnp):
                zipped_files.remove(fname)
                remove_cover = True
            else:
                optimize_image(path_for_cmd(fnp))

        if path(fname).ext in ('.htm', '.html'):
            with open(fnp, 'r') as f:
                html = update_html_for_static(book=book,
                                              html_content=f.read(),
                                              epub=True)
            with open(fnp, 'w') as f:
                f.write(html)

        if path(fname).ext == '.ncx':
            pattern = "*** START: FULL LICENSE ***"
            with open(fnp, 'r') as f:
                ncx = f.read()
            soup = BeautifulSoup(ncx, ["lxml", "xml"])
            for tag in soup.findAll('text'):
                if pattern in tag.text:
                    # remove the license section and everything after it
                    s = tag.parent.parent
                    for sibling in list(s.next_siblings):
                        sibling.decompose()
                    s.decompose()

            with open(fnp, 'w') as f:
                f.write(soup.encode())

    # delete {id}/cover.jpg if it exists and update {id}/content.opf
    if remove_cover:
        path(os.path.join(tmpd, str(book.id), 'cover.jpg')).unlink_p()

        opff = os.path.join(tmpd, str(book.id), 'content.opf')
        if os.path.exists(opff):
            with open(opff, 'r') as fd:
                soup = BeautifulSoup(fd.read(), ["lxml", "xml"])

            for elem in soup.findAll():
                if getattr(elem, 'attrs', {}).get('href') == 'cover.jpg':
                    elem.decompose()

            with open(opff, 'w') as fd:
                fd.write(soup.encode())

    with cd(tmpd):
        exec_cmd('zip -q0X "{dst}" mimetype'.format(dst=path_for_cmd(dst)))
        exec_cmd('zip -qXr9D "{dst}" {files}'.format(
            dst=path_for_cmd(dst),
            files=" ".join(
                [f for f in zipped_files if f != 'mimetype'])))

    path(tmpd).rmtree_p()
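
The two zip invocations at the end enforce the EPUB (OCF) container rule: the mimetype entry must come first and be stored uncompressed (-0), with the remaining files added afterwards. A standard-library sketch of that same packaging step (illustrative only):

import os
import zipfile

def pack_epub(dst, tmpd, zipped_files):
    with zipfile.ZipFile(dst, 'w') as zf:
        # mimetype first, stored uncompressed, as OCF requires
        zf.write(os.path.join(tmpd, 'mimetype'), 'mimetype',
                 compress_type=zipfile.ZIP_STORED)
        for fname in zipped_files:
            if fname == 'mimetype':
                continue
            zf.write(os.path.join(tmpd, fname), fname,
                     compress_type=zipfile.ZIP_DEFLATED)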
Example #8
def export_book_to(book, static_folder, download_cache, cached_files,
                   languages, formats, books):
    logger.info("\tExporting Book #{id}.".format(id=book.id))

    # actual book content, as HTML
    html = html_content_for(book=book,
                            static_folder=static_folder,
                            download_cache=download_cache)
    if html:
        article_fpath = os.path.join(static_folder, article_name_for(book))
        logger.info("\t\tExporting to {}".format(article_fpath))
        try:
            new_html = update_html_for_static(book=book, html_content=html)
        except Exception:
            new_html = html
        with open(article_fpath, 'w') as f:
            f.write(new_html)

    def symlink_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tSymlinking {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).link(dst)  # hard link
        except IOError:
            logger.error("/!\ Unable to symlink missing file {}".format(src))
            return

    def copy_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tCopying {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).copy(dst)
        except IOError:
            logger.error("/!\ Unable to copy missing file {}".format(src))
            return

    def optimize_image(fpath):
        if path(fpath).ext == '.png':
            return optimize_png(fpath)
        if path(fpath).ext in ('.jpg', '.jpeg'):
            return optimize_jpeg(fpath)
        if path(fpath).ext == '.gif':
            return optimize_gif(fpath)
        return fpath

    def optimize_gif(fpath):
        exec_cmd('gifsicle -O3 "{path}" -o "{path}"'.format(path=fpath))

    def optimize_png(fpath):
        pngquant = 'pngquant --nofs --force --ext=".png" "{path}"'
        advdef = 'advdef -z -4 -i 5 "{path}"'
        exec_cmd(pngquant.format(path=fpath))
        exec_cmd(advdef.format(path=fpath))

    def optimize_jpeg(fpath):
        exec_cmd('jpegoptim --strip-all -m50 "{path}"'.format(path=fpath))

    def optimize_epub(src, dst):
        logger.info("\t\tCreating ePUB at {}".format(dst))
        zipped_files = []
        # create temp directory to extract to
        tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)
        with zipfile.ZipFile(src, 'r') as zf:
            zipped_files = zf.namelist()
            zf.extractall(tmpd)

        remove_cover = False
        # iterate over a copy since entries may be removed below
        for fname in list(zipped_files):
            fnp = os.path.join(tmpd, fname)
            if path(fname).ext in ('.png', '.jpeg', '.jpg', '.gif'):

                # special case to remove ugly cover
                if fname.endswith('cover.jpg') and is_bad_cover(fnp):
                    zipped_files.remove(fname)
                    remove_cover = True
                else:
                    optimize_image(path_for_cmd(fnp))

            if path(fname).ext in ('.htm', '.html'):
                f = open(fnp, 'r')
                html = update_html_for_static(book=book,
                                              html_content=f.read(),
                                              epub=True)
                f.close()
                with open(fnp, 'w') as f:
                    f.write(html)

            if path(fname).ext == '.ncx':
                pattern = "*** START: FULL LICENSE ***"
                f = open(fnp, 'r')
                ncx = f.read()
                f.close()
                soup = BeautifulSoup(ncx, ["lxml", "xml"])
                for tag in soup.findAll('text'):
                    if pattern in tag.text:
                        # remove the license section and everything after it
                        s = tag.parent.parent
                        for sibling in list(s.next_siblings):
                            sibling.decompose()
                        s.decompose()

                with open(fnp, 'w') as f:
                    f.write(soup.encode())

        # delete {id}/cover.jpg if exist and update {id}/content.opf
        if remove_cover:

            # remove cover
            path(os.path.join(tmpd, str(book.id), 'cover.jpg')).unlink_p()

            soup = None
            opff = os.path.join(tmpd, str(book.id), 'content.opf')
            if os.path.exists(opff):
                with open(opff, 'r') as fd:
                    soup = BeautifulSoup(fd.read(), ["lxml", "xml"])

                for elem in soup.findAll():
                    if getattr(elem, 'attrs', {}).get('href') == 'cover.jpg':
                        elem.decompose()

                with open(opff, 'w') as fd:
                    fd.write(soup.encode())

        with cd(tmpd):
            exec_cmd('zip -q0X "{dst}" mimetype'.format(dst=path_for_cmd(dst)))
            exec_cmd('zip -qXr9D "{dst}" {files}'.format(
                dst=path_for_cmd(dst),
                files=" ".join(
                    [f for f in zipped_files if f != 'mimetype'])))

        path(tmpd).rmtree_p()

    def handle_companion_file(fname, dstfname=None, book=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)

        # optimization based on mime/extension
        if path(fname).ext in ('.png', '.jpg', '.jpeg', '.gif'):
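            # note: src and dst are already absolute here; the join inside
            # copy_from_cache leaves an absolute second argument unchanged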
            copy_from_cache(src, dst)
            optimize_image(path_for_cmd(dst))
        elif path(fname).ext == '.epub':
            tmp_epub = tempfile.NamedTemporaryFile(suffix='.epub',
                                                   dir=TMP_FOLDER)
            tmp_epub.close()
            optimize_epub(src, tmp_epub.name)
            path(tmp_epub.name).move(dst)
        else:
            # excludes files created by Windows Explorer
            if src.endswith('_Thumbs.db'):
                return
            # copy otherwise (PDF mostly)
            logger.debug("\t\tshitty ext: {}".format(dst))
            copy_from_cache(src, dst)

    # associated files (images, etc)
    for fname in [
            fn for fn in cached_files if fn.startswith("{}_".format(book.id))
    ]:

        if path(fname).ext in ('.html', '.htm'):
            src = os.path.join(path(download_cache).abspath(), fname)
            dst = os.path.join(path(static_folder).abspath(), fname)

            logger.info("\t\tExporting HTML file to {}".format(dst))
            html = "CAN'T READ FILE"
            with open(src, 'r') as f:
                html = f.read()
            new_html = update_html_for_static(book=book, html_content=html)
            with open(dst, 'w') as f:
                f.write(new_html)
        else:
            logger.info("\t\tCopying companion file to {}".format(fname))
            try:
                handle_companion_file(fname)
            except Exception as e:
                logger.error(
                    "\t\tException while handling companion file: {}".format(
                        e))

    # other formats
    for format in formats:
        if format not in book.formats() or format == 'html':
            continue
        logger.info("\t\tCopying format file to {}".format(
            archive_name_for(book, format)))
        try:
            handle_companion_file(fname_for(book, format),
                                  archive_name_for(book, format))
        except Exception as e:
            logger.error(
                "\t\tException while handling companion file: {}".format(e))

    # book presentation article
    cover_fpath = os.path.join(static_folder,
                               article_name_for(book=book, cover=True))
    logger.info("\t\tExporting to {}".format(cover_fpath))
    html = cover_html_content_for(book=book,
                                  static_folder=static_folder,
                                  books=books)
    with open(cover_fpath, 'w') as f:
        f.write(html.encode('utf-8'))
Example #9
def main(arguments):

    # actions constants
    DO_PREPARE = arguments.get('--prepare', False)
    DO_PARSE = arguments.get('--parse', False)
    DO_DOWNLOAD = arguments.get('--download', False)
    DO_EXPORT = arguments.get('--export', False)
    DO_ZIM = arguments.get('--zim', False)
    DO_CHECKDEPS = arguments.get('--check', False)
    COMPLETE_DUMP = arguments.get('--complete', False)

    URL_MIRROR = arguments.get('--mirror') or 'http://zimfarm.kiwix.org/gutenberg'
    RDF_FOLDER = arguments.get('--rdf-folder') or os.path.join('rdf-files')
    STATIC_FOLDER = arguments.get('--static-folder') or os.path.join('static')
    ZIM_FILE = arguments.get('--zim-file')
    WIPE_DB = not arguments.get('--keep-db')
    RDF_URL = arguments.get('--rdf-url') or 'http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2'
    DL_CACHE = arguments.get('--dl-folder') or os.path.join('dl-cache')
    BOOKS = arguments.get('--books') or ''
    ZTITLE = arguments.get('--zim-title')
    ZDESC = arguments.get('--zim-desc')

    # create tmp dir
    path('tmp').mkdir_p()

    LANGUAGES = [x.strip().lower()
                 for x in (arguments.get('--languages') or '').split(',')
                 if x.strip()]
    # special shortcuts for "all"
    if arguments.get('--formats') in ['all', None]:
        FORMATS = ['epub', 'pdf']
    else:
        FORMATS = [x.strip().lower()
                   for x in (arguments.get('--formats') or '').split(',')
                   if x.strip()]

    try:
        BOOKS = BOOKS.split(',')
        f = lambda x: [int(i) for i in x.split('-') if i.isdigit()]
        books = []
        for i in BOOKS:
            blst = f(i)
            if len(blst) > 1:
                blst = range(blst[0], blst[1]+1)
            books.extend(blst)
        BOOKS = list(set(books))
    except Exception as e:
        logger.error(e)
        BOOKS = []

    # no arguments, default to --complete
    if not (DO_PREPARE + DO_PARSE + DO_DOWNLOAD + DO_EXPORT + DO_ZIM):
        COMPLETE_DUMP = True

    if COMPLETE_DUMP:
        DO_CHECKDEPS = DO_PREPARE = DO_PARSE = \
            DO_DOWNLOAD = DO_EXPORT = DO_ZIM = True

    if DO_CHECKDEPS:
        logger.info("CHECKING for dependencies on the system")
        if not check_dependencies()[0]:
            logger.error("Exiting...")
            sys.exit(1)

    if DO_PREPARE:
        logger.info("PREPARING rdf-files cache from {}".format(RDF_URL))
        setup_rdf_folder(rdf_url=RDF_URL, rdf_path=RDF_FOLDER)

    if DO_PARSE:
        logger.info("PARSING rdf-files in {}".format(RDF_FOLDER))
        setup_database(wipe=WIPE_DB)
        parse_and_fill(rdf_path=RDF_FOLDER, only_books=BOOKS)

    if DO_DOWNLOAD:
        logger.info("DOWNLOADING ebooks from mirror using filters")
        download_all_books(url_mirror=URL_MIRROR,
                           download_cache=DL_CACHE,
                           languages=LANGUAGES,
                           formats=FORMATS,
                           only_books=BOOKS)

    if DO_EXPORT:
        logger.info("EXPORTING ebooks to static folder (and JSON)")
        export_all_books(static_folder=STATIC_FOLDER,
                         download_cache=DL_CACHE,
                         languages=LANGUAGES,
                         formats=FORMATS,
                         only_books=BOOKS)

    if DO_ZIM:
        if not check_dependencies()[1]:
            logger.error("You don't have zimwriterfs installed.")
            sys.exit(1)
        logger.info("BUILDING ZIM off static folder {}".format(STATIC_FOLDER))
        build_zimfile(static_folder=STATIC_FOLDER, zim_path=ZIM_FILE,
                      languages=LANGUAGES, formats=FORMATS,
                      only_books=BOOKS,
                      title=ZTITLE, description=ZDESC)
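
main() expects a docopt-style mapping of long options to values. A minimal sketch of the entry point under that assumption (the script name and usage string are abridged and illustrative):

from docopt import docopt

usage = """
Usage: dump-gutenberg.py [--prepare] [--parse] [--download] [--export]
                         [--zim] [--check] [--complete]
                         [--books=IDS] [--formats=LIST] [--languages=LIST]
"""

if __name__ == '__main__':
    main(docopt(usage))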
Example #10
def build_zimfile(static_folder, zim_path=None,
                  languages=None, formats=None,
                  title=None, description=None,
                  only_books=None):

    # avoid mutable default arguments: the sort() calls below would
    # otherwise mutate a default list shared across calls
    languages = languages or []
    formats = formats or []

    if not languages:
        languages = ['mul']

    languages.sort()
    formats.sort()

    if title is None:
        if len(languages) > 5:
            title = ("Project Gutenberg Library with {formats}"
                     .format(formats=",".join(formats)))
        else:
            title = ("Project Gutenberg Library ({langs}) with {formats}"
                     .format(langs=",".join(languages),
                             formats=",".join(formats)))

    logger.info("\tWritting ZIM for {}".format(title))

    if description is None:
        description = "The first producer of free ebooks"

    if zim_path is None:
        if len(languages) > 1:
            zim_path = "gutenberg_all_{date}.zim".format(
                    date=datetime.datetime.now().strftime('%m_%Y'))
        else:
            zim_path = "gutenberg_{lang}_all_{date}.zim".format(
                    lang=languages[0],
                    date=datetime.datetime.now().strftime('%Y-%m'))

    languages = [ISO_MATRIX.get(lang, lang) for lang in languages]
    languages.sort()

    context = {
        'languages': ','.join(languages),
        'title': title,
        'description': description,
        'creator': 'gutenberg.org',
        'publisher': 'Kiwix',

        'home': 'Home.html',
        'favicon': 'favicon.png',

        'static': static_folder,
        'zim': zim_path
    }

    cmd = ('zimwriterfs --welcome=\\"{home}\\" --favicon=\\"{favicon}\\" '
           '--language=\\"{languages}\\" --title=\\"{title}\\" '
           '--description=\\"{description}\\" '
           '--creator=\\"{creator}\\" --publisher=\\"{publisher}\\" \\"{static}\\" \\"{zim}\\"'
           .format(**context))

    logger.debug("\t\t{}".format(re.sub('\\\\"','"',cmd)))
    if exec_cmd(cmd):
        logger.info("Successfuly created ZIM file at {}".format(zim_path))
    else:
        logger.error("Unable to create ZIM file :(")
Example #11
def download_all_books(url_mirror,
                       download_cache,
                       languages=None,
                       formats=None,
                       only_books=None,
                       force=False):

    # avoid mutable default arguments: formats is appended to below
    languages = languages or []
    formats = list(formats) if formats else []
    only_books = only_books or []

    available_books = get_list_of_filtered_books(languages=languages,
                                                 formats=formats,
                                                 only_books=only_books)

    # ensure the dir exists
    path(download_cache).mkdir_p()

    for book in available_books:

        logger.info(
            "\tDownloading content files for Book #{id}".format(id=book.id))

        # apply filters
        if not formats:
            formats = list(FORMAT_MATRIX.keys())

        # HTML is our base for the ZIM, so add it if not present
        if 'html' not in formats:
            formats.append('html')

        for format in formats:

            fpath = os.path.join(download_cache, fname_for(book, format))

            # check if already downloaded
            if path(fpath).exists() and not force:
                logger.debug("\t\t{fmt} already exists at {path}".format(
                    fmt=format, path=fpath))
                continue

            # retrieve corresponding BookFormat
            bfs = BookFormat.filter(book=book)

            if format == 'html':
                patterns = [
                    'mnsrb10h.htm', '8ledo10h.htm', 'tycho10f.htm',
                    '8ledo10h.zip', 'salme10h.htm', '8nszr10h.htm',
                    '{id}-h.html', '{id}.html.gen', '{id}-h.htm',
                    '8regr10h.zip', '{id}.html.noimages', '8lgme10h.htm',
                    'tycho10h.htm', 'tycho10h.zip', '8lgme10h.zip',
                    '8indn10h.zip', '8resp10h.zip', '20004-h.htm',
                    '8indn10h.htm', '8memo10h.zip', 'fondu10h.zip',
                    '{id}-h.zip', '8mort10h.zip'
                ]
                bfso = bfs
                bfs = bfs.join(Format).filter(Format.pattern << patterns)
                if not bfs.count():
                    from pprint import pprint as pp
                    pp(list([(b.format.mime, b.format.images,
                              b.format.pattern) for b in bfs]))
                    pp(list([(b.format.mime, b.format.images,
                              b.format.pattern) for b in bfso]))
                    logger.error("html not found")
                    continue
            else:
                bfs = bfs.filter(BookFormat.format << Format.filter(
                    mime=FORMAT_MATRIX.get(format)))

            if not bfs.count():
                logger.debug("[{}] not avail. for #{}# {}".format(
                    format, book.id, book.title))
                continue

            if bfs.count() > 1:
                try:
                    bf = bfs.join(Format).filter(Format.images == True).get()
                except Exception:
                    bf = bfs.get()
            else:
                bf = bfs.get()

            logger.debug("[{}] Requesting URLs for #{}# {}".format(
                format, book.id, book.title))

            # retrieve list of URLs for format unless we have it in DB
            if bf.downloaded_from and not force:
                urls = [bf.downloaded_from]
            else:
                urld = get_urls(book)
                urls = list(reversed(urld.get(FORMAT_MATRIX.get(format))))

            allurls = list(urls)

            while urls:
                url = urls.pop()

                if not resource_exists(url):
                    continue

                # HTML files are *sometimes* available as ZIP files
                if url.endswith('.zip'):
                    zpath = "{}.zip".format(fpath)

                    if not download_file(url, zpath):
                        logger.error(
                            "ZIP file download failed: {}".format(zpath))
                        continue

                    # extract zipfile
                    handle_zipped_epub(zippath=zpath,
                                       book=book,
                                       download_cache=download_cache)
                else:
                    if not download_file(url, fpath):
                        logger.error("file donwload failed: {}".format(fpath))
                        continue

                # store working URL in DB and stop trying further mirrors
                bf.downloaded_from = url
                bf.save()
                break

            if not bf.downloaded_from:
                logger.error("NO FILE FOR #{}/{}".format(book.id, format))
                from pprint import pprint as pp
                pp(allurls)
                continue
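
resource_exists, used to probe each candidate URL above, is not shown either. A plausible sketch with requests (an assumption, not the project's code):

import requests

def resource_exists(url):
    # HEAD request: check availability without downloading the body
    try:
        resp = requests.head(url, allow_redirects=True, timeout=30)
        return resp.status_code == 200
    except requests.RequestException:
        return False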
Example #12
def load_fixtures(model):
    logger.info("Loading fixtures for {}".format(model._meta.name))

    for fixture in getattr(model._meta, 'fixtures', []):
        f = model.create(**fixture)
        logger.debug("[fixtures] Created {}".format(f))
Example #13
def export_to_json_helpers(books, static_folder, languages, formats):

    def dumpjs(col, fn, var='json_data'):
        with open(os.path.join(static_folder, fn), 'w') as f:
            f.write("var {var} = ".format(var=var))
            f.write(json.dumps(col))
            f.write(";")
            # json.dump(col, f)

    # all books sorted by popularity
    logger.info("\t\tDumping full_by_popularity.js")
    dumpjs([book.to_array()
            for book in books.order_by(Book.downloads.desc())],
           'full_by_popularity.js')

    # all books sorted by title
    logger.info("\t\tDumping full_by_title.js")
    dumpjs([book.to_array()
            for book in books.order_by(Book.title.asc())],
           'full_by_title.js')

    avail_langs = get_langs_with_count(books=books)

    all_filtered_authors = []

    # language-specific collections
    for lang_name, lang, lang_count in avail_langs:
        lang_filtered_authors = list(
            set([book.author.gut_id for book in books.filter(language=lang)]))
        for aid in lang_filtered_authors:
            if aid not in all_filtered_authors:
                all_filtered_authors.append(aid)

        # by popularity
        logger.info("\t\tDumping lang_{}_by_popularity.js".format(lang))
        dumpjs(
            [book.to_array()
             for book in books.where(Book.language == lang)
                              .order_by(Book.downloads.desc())],
            'lang_{}_by_popularity.js'.format(lang))
        # by title
        logger.info("\t\tDumping lang_{}_by_title.js".format(lang))
        dumpjs(
            [book.to_array()
             for book in books.where(Book.language == lang)
                              .order_by(Book.title.asc())],
            'lang_{}_by_title.js'.format(lang))

        authors = authors_from_ids(lang_filtered_authors)
        logger.info("\t\tDumping authors_lang_{}.js".format(lang))
        dumpjs([author.to_array() for author in authors],
               'authors_lang_{}.js'.format(lang), 'authors_json_data')

    # author-specific collections
    authors = authors_from_ids(all_filtered_authors)
    for author in authors:

        # all_filtered_authors.remove(author.gut_id)
        # by popularity
        logger.info(
            "\t\tDumping auth_{}_by_popularity.js".format(author.gut_id))
        dumpjs(
            [book.to_array()
             for book in books.where(Book.author == author)
                              .order_by(Book.downloads.desc())],
            'auth_{}_by_popularity.js'.format(author.gut_id))
        # by title
        logger.info("\t\tDumping auth_{}_by_title.js".format(author.gut_id))
        dumpjs(
            [book.to_array()
             for book in books.where(Book.author == author)
                              .order_by(Book.title.asc())],
            'auth_{}_by_title.js'.format(author.gut_id))

    # authors list sorted by name
    logger.info("\t\tDumping authors.js")
    dumpjs([author.to_array() for author in authors],
           'authors.js', 'authors_json_data')

    # languages list sorted by code
    logger.info("\t\tDumping languages.js")
    dumpjs(avail_langs, 'languages.js', 'languages_json_data')

    # languages by weight
    main_languages, other_languages = get_lang_groups(books)
    logger.info("\t\tDumping main_languages.js")
    dumpjs(main_languages, 'main_languages.js', 'main_languages_json_data')
    dumpjs(other_languages, 'other_languages.js', 'other_languages_json_data')