def download_rdf_file(rdf_url):
    """Download the RDF archive unless it is already present on disk.

    The archive is always stored under the relative name
    ``rdf-files.tar.bz2``, i.e. in the current working directory.

    :param rdf_url: URL handed to ``download_file`` as the download source.
    :return: the archive file name, whether it was just downloaded or was
        already there.
    """
    import os  # local import: only needed to report the directory below

    fname = 'rdf-files.tar.bz2'

    if path(fname).exists():
        # Report the real file name and the directory the relative path
        # resolves against (the message used to misspell the file name and
        # print the file name as its own location).
        logger.info("\t{} already exists in {}".format(fname, os.getcwd()))
        return fname

    logger.info("\tDownloading {} into {}".format(rdf_url, fname))
    download_file(rdf_url, fname)
    return fname
def download_all_books(url_mirror, download_cache,
                       languages=None, formats=None,
                       only_books=None, force=False):
    """Download the content files of every selected book into a cache folder.

    For each book returned by ``get_list_of_filtered_books`` and each wanted
    format, the matching ``BookFormat`` row is looked up, its candidate URLs
    are tried, and the URL that worked is stored in
    ``BookFormat.downloaded_from``.

    :param url_mirror: not used by this function; kept for interface
        compatibility.
    :param download_cache: folder receiving the files (created if missing).
    :param languages: language filter forwarded to
        ``get_list_of_filtered_books``; ``None`` or empty means no filter.
    :param formats: formats to download; ``None`` or empty means every key of
        ``FORMAT_MATRIX``. ``'html'`` is always added. The caller's list is
        never modified.
    :param only_books: book filter forwarded to
        ``get_list_of_filtered_books``; ``None`` or empty means no filter.
    :param force: when true, re-download files that already exist in the
        cache and ignore any URL previously stored in the database.
    """
    from pprint import pprint

    # Work on private copies: the defaults are no longer shared mutable lists
    # and appending 'html' below cannot leak into the caller's list.
    languages = list(languages) if languages else []
    only_books = list(only_books) if only_books else []
    requested_formats = list(formats) if formats else []

    available_books = get_list_of_filtered_books(
        languages=languages, formats=requested_formats,
        only_books=only_books)

    # list(...) is required: on Python 3 ``dict.keys()`` is a view without
    # ``append``.
    if requested_formats:
        wanted_formats = list(requested_formats)
    else:
        wanted_formats = list(FORMAT_MATRIX.keys())
    # HTML is our base for ZIM, so add it if not present
    if 'html' not in wanted_formats:
        wanted_formats.append('html')

    # File-name patterns identifying the HTML rendition of a book.
    html_patterns = [
        'mnsrb10h.htm', '8ledo10h.htm', 'tycho10f.htm', '8ledo10h.zip',
        'salme10h.htm', '8nszr10h.htm', '{id}-h.html', '{id}.html.gen',
        '{id}-h.htm', '8regr10h.zip', '{id}.html.noimages',
        '8lgme10h.htm', 'tycho10h.htm', 'tycho10h.zip', '8lgme10h.zip',
        '8indn10h.zip', '8resp10h.zip', '20004-h.htm', '8indn10h.htm',
        '8memo10h.zip', 'fondu10h.zip', '{id}-h.zip', '8mort10h.zip',
    ]

    # ensure dir exists
    path(download_cache).mkdir_p()

    for book in available_books:
        logger.info("\tDownloading content files for Book #{id}"
                    .format(id=book.id))

        for fmt in wanted_formats:
            fpath = os.path.join(download_cache, fname_for(book, fmt))

            # check if already downloaded
            if path(fpath).exists() and not force:
                logger.debug("\t\t{fmt} already exists at {path}"
                             .format(fmt=fmt, path=fpath))
                continue

            # retrieve corresponding BookFormat
            book_formats = BookFormat.filter(book=book)

            if fmt == 'html':
                all_formats = book_formats
                book_formats = book_formats.join(Format).filter(
                    Format.pattern << html_patterns)
                if not book_formats.count():
                    # Dump what was matched and what exists for this book to
                    # help diagnose the missing HTML.
                    for candidates in (book_formats, all_formats):
                        pprint([(c.format.mime, c.format.images,
                                 c.format.pattern) for c in candidates])
                    logger.error("html not found")
                    continue
            else:
                book_formats = book_formats.filter(
                    BookFormat.format << Format.filter(
                        mime=FORMAT_MATRIX.get(fmt)))

            if not book_formats.count():
                logger.debug("[{}] not avail. for #{}# {}"
                             .format(fmt, book.id, book.title))
                continue

            if book_formats.count() > 1:
                # Several candidates: prefer the one flagged as having
                # images, otherwise fall back to the first match.
                try:
                    # ``== True`` builds a query expression; do not turn it
                    # into ``is True``.
                    bf = book_formats.join(Format).filter(
                        Format.images == True).get()  # noqa: E712
                except Exception:
                    bf = book_formats.get()
            else:
                bf = book_formats.get()

            logger.debug("[{}] Requesting URLs for #{}# {}"
                         .format(fmt, book.id, book.title))

            # retrieve list of URLs for format unless we have it in DB
            if bf.downloaded_from and not force:
                urls = [bf.downloaded_from]
            else:
                urld = get_urls(book)
                # Reversed so that pop() below yields the URLs in their
                # original order; a missing entry means "no URL".
                urls = list(reversed(
                    urld.get(FORMAT_MATRIX.get(fmt)) or []))

            allurls = list(urls)

            # NOTE(review): the loop keeps going after a successful download,
            # so every reachable URL is fetched and the last one is recorded.
            # Confirm whether a ``break`` was intended.
            while urls:
                url = urls.pop()

                if not resource_exists(url):
                    continue

                # HTML files are *sometimes* available as ZIP files
                if url.endswith('.zip'):
                    zpath = "{}.zip".format(fpath)

                    if not download_file(url, zpath):
                        logger.error("ZIP file donwload failed: {}"
                                     .format(zpath))
                        continue

                    # extract zipfile
                    handle_zipped_epub(zippath=zpath, book=book,
                                       download_cache=download_cache)
                else:
                    if not download_file(url, fpath):
                        logger.error("file donwload failed: {}"
                                     .format(fpath))
                        continue

                # store working URL in DB
                bf.downloaded_from = url
                bf.save()

            if not bf.downloaded_from:
                logger.error("NO FILE FOR #{}/{}".format(book.id, fmt))
                pprint(allurls)
def download_all_books(url_mirror, download_cache,
                       languages=None, formats=None,
                       only_books=None, force=False):
    """Download the content files of every selected book into a cache folder.

    For each book returned by ``get_list_of_filtered_books`` and each wanted
    format, the matching ``BookFormat`` row is looked up, its candidate URLs
    are tried, and the URL that worked is stored in
    ``BookFormat.downloaded_from``.

    :param url_mirror: not used by this function; kept for interface
        compatibility.
    :param download_cache: folder receiving the files (created if missing).
    :param languages: language filter forwarded to
        ``get_list_of_filtered_books``; ``None`` or empty means no filter.
    :param formats: formats to download; ``None`` or empty means every key of
        ``FORMAT_MATRIX``. ``'html'`` is always added. The caller's list is
        never modified.
    :param only_books: book filter forwarded to
        ``get_list_of_filtered_books``; ``None`` or empty means no filter.
    :param force: when true, re-download files that already exist in the
        cache and ignore any URL previously stored in the database.
    """
    from pprint import pprint

    # Work on private copies: the defaults are no longer shared mutable lists
    # and appending 'html' below cannot leak into the caller's list.
    languages = list(languages) if languages else []
    only_books = list(only_books) if only_books else []
    requested_formats = list(formats) if formats else []

    available_books = get_list_of_filtered_books(
        languages=languages, formats=requested_formats,
        only_books=only_books)

    # list(...) is required: on Python 3 ``dict.keys()`` is a view without
    # ``append``.
    if requested_formats:
        wanted_formats = list(requested_formats)
    else:
        wanted_formats = list(FORMAT_MATRIX.keys())
    # HTML is our base for ZIM, so add it if not present
    if 'html' not in wanted_formats:
        wanted_formats.append('html')

    # File-name patterns identifying the HTML rendition of a book.
    html_patterns = [
        'mnsrb10h.htm', '8ledo10h.htm', 'tycho10f.htm', '8ledo10h.zip',
        'salme10h.htm', '8nszr10h.htm', '{id}-h.html', '{id}.html.gen',
        '{id}-h.htm', '8regr10h.zip', '{id}.html.noimages',
        '8lgme10h.htm', 'tycho10h.htm', 'tycho10h.zip', '8lgme10h.zip',
        '8indn10h.zip', '8resp10h.zip', '20004-h.htm', '8indn10h.htm',
        '8memo10h.zip', 'fondu10h.zip', '{id}-h.zip', '8mort10h.zip',
    ]

    # ensure dir exists
    path(download_cache).mkdir_p()

    for book in available_books:
        logger.info("\tDownloading content files for Book #{id}"
                    .format(id=book.id))

        for fmt in wanted_formats:
            fpath = os.path.join(download_cache, fname_for(book, fmt))

            # check if already downloaded
            if path(fpath).exists() and not force:
                logger.debug("\t\t{fmt} already exists at {path}"
                             .format(fmt=fmt, path=fpath))
                continue

            # retrieve corresponding BookFormat
            book_formats = BookFormat.filter(book=book)

            if fmt == 'html':
                all_formats = book_formats
                book_formats = book_formats.join(Format).filter(
                    Format.pattern << html_patterns)
                if not book_formats.count():
                    # Dump what was matched and what exists for this book to
                    # help diagnose the missing HTML.
                    for candidates in (book_formats, all_formats):
                        pprint([(c.format.mime, c.format.images,
                                 c.format.pattern) for c in candidates])
                    logger.error("html not found")
                    continue
            else:
                book_formats = book_formats.filter(
                    BookFormat.format << Format.filter(
                        mime=FORMAT_MATRIX.get(fmt)))

            if not book_formats.count():
                logger.debug("[{}] not avail. for #{}# {}"
                             .format(fmt, book.id, book.title))
                continue

            if book_formats.count() > 1:
                # Several candidates: prefer the one flagged as having
                # images, otherwise fall back to the first match.
                try:
                    # ``== True`` builds a query expression; do not turn it
                    # into ``is True``.
                    bf = book_formats.join(Format).filter(
                        Format.images == True).get()  # noqa: E712
                except Exception:
                    bf = book_formats.get()
            else:
                bf = book_formats.get()

            logger.debug("[{}] Requesting URLs for #{}# {}"
                         .format(fmt, book.id, book.title))

            # retrieve list of URLs for format unless we have it in DB
            if bf.downloaded_from and not force:
                urls = [bf.downloaded_from]
            else:
                urld = get_urls(book)
                # Reversed so that pop() below yields the URLs in their
                # original order; a missing entry means "no URL".
                urls = list(reversed(
                    urld.get(FORMAT_MATRIX.get(fmt)) or []))

            allurls = list(urls)

            # NOTE(review): the loop keeps going after a successful download,
            # so every reachable URL is fetched and the last one is recorded.
            # Confirm whether a ``break`` was intended.
            while urls:
                url = urls.pop()

                if not resource_exists(url):
                    continue

                # HTML files are *sometimes* available as ZIP files
                if url.endswith('.zip'):
                    zpath = "{}.zip".format(fpath)

                    if not download_file(url, zpath):
                        logger.error("ZIP file donwload failed: {}"
                                     .format(zpath))
                        continue

                    # extract zipfile
                    handle_zipped_epub(zippath=zpath, book=book,
                                       download_cache=download_cache)
                else:
                    if not download_file(url, fpath):
                        logger.error("file donwload failed: {}"
                                     .format(fpath))
                        continue

                # store working URL in DB
                bf.downloaded_from = url
                bf.save()

            if not bf.downloaded_from:
                logger.error("NO FILE FOR #{}/{}".format(book.id, fmt))
                pprint(allurls)