Beispiel #1
0
def download_all_books(url_mirror,
                       download_cache,
                       languages=[],
                       formats=[],
                       only_books=[],
                       force=False):

    available_books = get_list_of_filtered_books(languages=languages,
                                                 formats=formats,
                                                 only_books=only_books)

    # ensure dir exist
    path(download_cache).mkdir_p()

    for book in available_books:

        logger.info(
            "\tDownloading content files for Book #{id}".format(id=book.id))

        # apply filters
        if not formats:
            formats = FORMAT_MATRIX.keys()

        # HTML is our base for ZIM for add it if not present
        if not 'html' in formats:
            formats.append('html')

        for format in formats:

            fpath = os.path.join(download_cache, fname_for(book, format))

            # check if already downloaded
            if path(fpath).exists() and not force:
                logger.debug("\t\t{fmt} already exists at {path}".format(
                    fmt=format, path=fpath))
                continue

            # retrieve corresponding BookFormat
            bfs = BookFormat.filter(book=book)

            if format == 'html':
                patterns = [
                    'mnsrb10h.htm', '8ledo10h.htm', 'tycho10f.htm',
                    '8ledo10h.zip', 'salme10h.htm', '8nszr10h.htm',
                    '{id}-h.html', '{id}.html.gen', '{id}-h.htm',
                    '8regr10h.zip', '{id}.html.noimages', '8lgme10h.htm',
                    'tycho10h.htm', 'tycho10h.zip', '8lgme10h.zip',
                    '8indn10h.zip', '8resp10h.zip', '20004-h.htm',
                    '8indn10h.htm', '8memo10h.zip', 'fondu10h.zip',
                    '{id}-h.zip', '8mort10h.zip'
                ]
                bfso = bfs
                bfs = bfs.join(Format).filter(Format.pattern << patterns)
                if not bfs.count():
                    from pprint import pprint as pp
                    pp(
                        list([(b.format.mime, b.format.images,
                               b.format.pattern) for b in bfs]))
                    from pprint import pprint as pp
                    pp(
                        list([(b.format.mime, b.format.images,
                               b.format.pattern) for b in bfso]))
                    logger.error("html not found")
                    continue
            else:
                bfs = bfs.filter(BookFormat.format << Format.filter(
                    mime=FORMAT_MATRIX.get(format)))

            if not bfs.count():
                logger.debug("[{}] not avail. for #{}# {}".format(
                    format, book.id, book.title))
                continue

            if bfs.count() > 1:
                try:
                    bf = bfs.join(Format).filter(Format.images == True).get()
                except:
                    bf = bfs.get()
            else:
                bf = bfs.get()

            logger.debug("[{}] Requesting URLs for #{}# {}".format(
                format, book.id, book.title))

            # retrieve list of URLs for format unless we have it in DB
            if bf.downloaded_from and not force:
                urls = [bf.downloaded_from]
            else:
                urld = get_urls(book)
                urls = list(reversed(urld.get(FORMAT_MATRIX.get(format))))

            import copy
            allurls = copy.copy(urls)

            while (urls):
                url = urls.pop()

                if not resource_exists(url):
                    continue

                # HTML files are *sometime* available as ZIP files
                if url.endswith('.zip'):
                    zpath = "{}.zip".format(fpath)

                    if not download_file(url, zpath):
                        logger.error(
                            "ZIP file donwload failed: {}".format(zpath))
                        continue

                    # extract zipfile
                    handle_zipped_epub(zippath=zpath,
                                       book=book,
                                       download_cache=download_cache)
                else:
                    if not download_file(url, fpath):
                        logger.error("file donwload failed: {}".format(fpath))
                        continue

                # store working URL in DB
                bf.downloaded_from = url
                bf.save()

            if not bf.downloaded_from:
                logger.error("NO FILE FOR #{}/{}".format(book.id, format))
                from pprint import pprint as pp
                pp(allurls)
                continue
Beispiel #2
0
def download_all_books(url_mirror, download_cache,
                       languages=[], formats=[],
                       only_books=[], force=False):

    available_books = get_list_of_filtered_books(
        languages=languages,
        formats=formats,
        only_books=only_books)

    # ensure dir exist
    path(download_cache).mkdir_p()

    for book in available_books:

        logger.info("\tDownloading content files for Book #{id}"
                    .format(id=book.id))

        # apply filters
        if not formats:
            formats = FORMAT_MATRIX.keys()

        # HTML is our base for ZIM for add it if not present
        if not 'html' in formats:
            formats.append('html')

        for format in formats:

            fpath = os.path.join(download_cache, fname_for(book, format))

            # check if already downloaded
            if path(fpath).exists() and not force:
                logger.debug("\t\t{fmt} already exists at {path}"
                             .format(fmt=format, path=fpath))
                continue

            # retrieve corresponding BookFormat
            bfs = BookFormat.filter(book=book)

            if format == 'html':
                patterns = ['mnsrb10h.htm', '8ledo10h.htm', 'tycho10f.htm',
                            '8ledo10h.zip', 'salme10h.htm', '8nszr10h.htm',
                            '{id}-h.html', '{id}.html.gen', '{id}-h.htm',
                            '8regr10h.zip', '{id}.html.noimages',
                            '8lgme10h.htm', 'tycho10h.htm', 'tycho10h.zip',
                            '8lgme10h.zip', '8indn10h.zip', '8resp10h.zip',
                            '20004-h.htm', '8indn10h.htm', '8memo10h.zip',
                            'fondu10h.zip', '{id}-h.zip', '8mort10h.zip']
                bfso = bfs
                bfs = bfs.join(Format).filter(Format.pattern << patterns)
                if not bfs.count():
                    from pprint import pprint as pp ; pp(list([(b.format.mime, b.format.images, b.format.pattern) for b in bfs]))
                    from pprint import pprint as pp ; pp(list([(b.format.mime, b.format.images, b.format.pattern) for b in bfso]))
                    logger.error("html not found")
                    continue
            else:
                bfs = bfs.filter(BookFormat.format << Format.filter(mime=FORMAT_MATRIX.get(format)))

            if not bfs.count():
                logger.debug("[{}] not avail. for #{}# {}"
                             .format(format, book.id, book.title))
                continue

            if bfs.count() > 1:
                try:
                    bf = bfs.join(Format).filter(Format.images == True).get()
                except:
                    bf = bfs.get()
            else:
                bf = bfs.get()

            logger.debug("[{}] Requesting URLs for #{}# {}"
                         .format(format, book.id, book.title))

            # retrieve list of URLs for format unless we have it in DB
            if bf.downloaded_from and not force:
                urls = [bf.downloaded_from]
            else:
                urld = get_urls(book)
                urls = list(reversed(urld.get(FORMAT_MATRIX.get(format))))

            import copy
            allurls = copy.copy(urls)

            while(urls):
                url = urls.pop()

                if not resource_exists(url):
                    continue

                # HTML files are *sometime* available as ZIP files
                if url.endswith('.zip'):
                    zpath = "{}.zip".format(fpath)

                    if not download_file(url, zpath):
                        logger.error("ZIP file donwload failed: {}".format(zpath))
                        continue

                    # extract zipfile
                    handle_zipped_epub(zippath=zpath, book=book,
                                       download_cache=download_cache)
                else:
                    if not download_file(url, fpath):
                        logger.error("file donwload failed: {}".format(fpath))
                        continue

                # store working URL in DB
                bf.downloaded_from = url
                bf.save()

            if not bf.downloaded_from:
                logger.error("NO FILE FOR #{}/{}".format(book.id, format))
                from pprint import pprint as pp ; pp(allurls)
                continue
Beispiel #3
0
def save_rdf_in_database(parser):

    # Insert author, if it not exists
    if parser.author_id:
        try:
            author_record = Author.get(gut_id=parser.author_id)
            if parser.last_name:
                author_record.last_name
            if parser.first_name:
                author_record.first_names = parser.first_name
            if parser.birth_year:
                author_record.birth_year = parser.birth_year
            if parser.death_year:
                author_record.death_year = parser.death_year
            author_record.save()
        except:
            author_record = Author.create(
                gut_id=parser.author_id,
                last_name=parser.last_name,
                first_names=parser.first_name,
                birth_year=parser.birth_year,
                death_year=parser.death_year)
    else:
        # No author, set Anonymous
        author_record = Author.get(gut_id='216')

    # Get license
    try:
        license_record = License.get(name=parser.license)
    except:
        license_record = None

    # Insert book
    book_record = Book.create(
        id=parser.gid,
        title=parser.title.strip(),
        subtitle=parser.subtitle.strip(),
        author=author_record,  # foreign key
        license=license_record,  # foreign key
        language=parser.language.strip(),
        downloads=parser.downloads
    )

    # Insert formats
    for file_type in parser.file_types:

        # Sanitize MIME
        mime = parser.file_types[file_type]
        if not mime.startswith('text/plain'):
            mime = re.sub(r'; charset=[a-z0-9-]+', '', mime)
        # else:
        #    charset = re.match(r'; charset=([a-z0-9-]+)', mime).groups()[0]

        # Insert format type
        pattern = re.sub(r'' + parser.gid, '{id}', file_type)
        pattern = pattern.split('/')[-1]

        bid = int(book_record.id)

        if bid in BAD_BOOKS_FORMATS.keys() \
            and mime in [FORMAT_MATRIX.get(f)
                         for f in BAD_BOOKS_FORMATS.get(bid)]:
            logger.error("\t**** EXCLUDING **** {} for book #{} from list."
                         .format(mime, bid))
            continue


        format_record = Format.get_or_create(
            mime=mime,
            images=file_type.endswith(
                '.images') or parser.file_types[file_type] == 'application/pdf',
            pattern=pattern)

        # Insert book format
        BookFormat.create(
            book=book_record,  # foreign key
            format=format_record  # foreign key
        )