def nb_by_fmt(fmt):
    """Count the books that own at least one file of format *fmt*.

    *fmt* is looked up in ``FORMAT_MATRIX`` to obtain the mime type to match.
    ``books`` is not defined here; it is read from the surrounding scope.

    returns: the number of entries of ``books`` for which a matching
             ``BookFormat`` row exists
    """
    total = 0
    for book in books:
        # One query per book: BookFormat rows of this book whose Format has
        # the requested mime type.
        matching = (BookFormat.select(BookFormat, Book, Format)
                    .join(Book)
                    .switch(BookFormat)
                    .join(Format)
                    .where(Book.id == book.id)
                    .where(Format.mime == FORMAT_MATRIX.get(fmt)))
        if matching.count():
            total += 1
    return total
def main_formats_for(book):
    """List the ``FORMAT_MATRIX`` keys whose mime type is available for *book*.

    param: book: the book whose ``BookFormat`` rows are inspected
    returns: a list of ``FORMAT_MATRIX`` keys, in ``FORMAT_MATRIX`` iteration
             order, whose value equals the mime of one of the book's formats
    """
    book_formats = (BookFormat.select(BookFormat, Book, Format)
                    .join(Book)
                    .switch(BookFormat)
                    .join(Format)
                    .where(Book.id == book.id))

    # Mime types of every format attached to the book.
    available_mimes = []
    for book_format in book_formats:
        available_mimes.append(book_format.format.mime)

    main_formats = []
    for name, mime in FORMAT_MATRIX.items():
        if mime in available_mimes:
            main_formats.append(name)
    return main_formats
def nb_by_fmt(fmt):
    """Return how many books have at least one file in format *fmt*.

    The mime type to look for is ``FORMAT_MATRIX.get(fmt)``. The ``books``
    collection is not a parameter; it comes from the surrounding scope.
    """
    def has_format(book):
        # True when at least one BookFormat row links this book to a Format
        # carrying the requested mime type.
        query = BookFormat.select(BookFormat, Book, Format)
        query = query.join(Book).switch(BookFormat).join(Format)
        query = query.where(Book.id == book.id)
        query = query.where(Format.mime == FORMAT_MATRIX.get(fmt))
        return bool(query.count())

    return sum(1 for book in books if has_format(book))
def get_urls(book):
    """
    Build the candidate download urls for a book.

    Every format attached to the book whose mime type (encoding suffix
    removed) is one of the ``FORMAT_MATRIX`` values is turned into a
    ``{name: {'mime': ..., 'id': ...}}`` entry, where ``name`` is the format
    pattern with the book id filled in. The entries are ordered by
    ``sort_by_mime_type`` and handed to ``build_urls``.

    param: book: The book you want the possible urls from
    returns: whatever ``build_urls`` produces for the sorted entries
    """
    def bare_mime(fmt):
        # Strip out the encoding of the file
        return fmt.mime.split(';')[0].strip()

    book_formats = []
    for link in BookFormat.select().where(BookFormat.book == book):
        book_formats.append(link.format)

    available_formats = []
    for fmt in book_formats:
        if bare_mime(fmt) not in FORMAT_MATRIX.values():
            continue
        name = fmt.pattern.format(id=book.id)
        available_formats.append(
            {name: {'mime': bare_mime(fmt), 'id': book.id}})

    files = sort_by_mime_type(available_formats)
    return build_urls(files)
def get_urls(book):
    """
    Collect the urls that may point to the files of a book.

    Only formats whose mime type, once its encoding suffix is removed, is a
    value of ``FORMAT_MATRIX`` are kept. Each kept format becomes a
    single-key dict mapping the filled-in pattern to its mime type and the
    book id; the list is then passed through ``sort_by_mime_type`` and
    ``build_urls``.

    param: book: The book you want the possible urls from
    returns: the value returned by ``build_urls``
    """
    def strip_encoding(fmt):
        # Strip out the encoding of the file
        mime_only = fmt.mime.split(';')[0]
        return mime_only.strip()

    formats_of_book = [
        link.format
        for link in BookFormat.select().where(BookFormat.book == book)
    ]
    supported = [
        fmt for fmt in formats_of_book
        if strip_encoding(fmt) in FORMAT_MATRIX.values()
    ]

    entries = []
    for fmt in supported:
        details = {'mime': strip_encoding(fmt), 'id': book.id}
        entries.append({fmt.pattern.format(id=book.id): details})

    return build_urls(sort_by_mime_type(entries))
def download_all_books(url_mirror, download_cache,
                       languages=[], formats=[], only_books=[],
                       force=False):
    """Download the content files of the selected books into *download_cache*.

    For each book returned by ``get_list_of_filtered_books`` and each
    requested format, the matching ``BookFormat`` row is looked up, a list of
    candidate URLs is obtained (the URL stored in the DB, or the ones from
    ``get_urls``) and the first URL that exists and downloads correctly is
    saved to ``bf.downloaded_from``.

    param: url_mirror: not used by this function
    param: download_cache: directory receiving the files (created if needed)
    param: languages, formats, only_books: filters forwarded to
           ``get_list_of_filtered_books``; an empty *formats* means every key
           of ``FORMAT_MATRIX``. ``'html'`` is always added to the formats.
    param: force: download again even when the file already exists or a
           working URL is already recorded
    returns: None

    The default lists are never mutated: *formats* is copied before use.
    """
    from pprint import pprint  # only used for the diagnostic dumps below

    # File-name patterns identifying the HTML rendition of a book.
    html_patterns = [
        'mnsrb10h.htm', '8ledo10h.htm', 'tycho10f.htm', '8ledo10h.zip',
        'salme10h.htm', '8nszr10h.htm', '{id}-h.html', '{id}.html.gen',
        '{id}-h.htm', '8regr10h.zip', '{id}.html.noimages', '8lgme10h.htm',
        'tycho10h.htm', 'tycho10h.zip', '8lgme10h.zip', '8indn10h.zip',
        '8resp10h.zip', '20004-h.htm', '8indn10h.htm', '8memo10h.zip',
        'fondu10h.zip', '{id}-h.zip', '8mort10h.zip',
    ]

    available_books = get_list_of_filtered_books(
        languages=languages, formats=formats, only_books=only_books)

    # ensure dir exist
    path(download_cache).mkdir_p()

    # apply filters -- on a private copy, so that appending 'html' neither
    # alters the caller's list nor fails on a dict view.
    formats = list(formats) if formats else list(FORMAT_MATRIX.keys())

    # HTML is our base for ZIM so add it if not present
    if 'html' not in formats:
        formats.append('html')

    for book in available_books:
        logger.info("\tDownloading content files for Book #{id}"
                    .format(id=book.id))

        for fmt in formats:
            fpath = os.path.join(download_cache, fname_for(book, fmt))

            # check if already downloaded
            if path(fpath).exists() and not force:
                logger.debug("\t\t{fmt} already exists at {path}"
                             .format(fmt=fmt, path=fpath))
                continue

            # retrieve corresponding BookFormat
            bfs = BookFormat.filter(book=book)

            if fmt == 'html':
                bfso = bfs
                bfs = bfs.join(Format).filter(Format.pattern << html_patterns)
                if not bfs.count():
                    # Diagnostic dump of the (empty) filtered selection and
                    # of every format known for the book.
                    pprint([(b.format.mime, b.format.images, b.format.pattern)
                            for b in bfs])
                    pprint([(b.format.mime, b.format.images, b.format.pattern)
                            for b in bfso])
                    logger.error("html not found")
                    continue
            else:
                bfs = bfs.filter(BookFormat.format << Format.filter(
                    mime=FORMAT_MATRIX.get(fmt)))

            nb_matches = bfs.count()
            if not nb_matches:
                logger.debug("[{}] not avail. for #{}# {}"
                             .format(fmt, book.id, book.title))
                continue

            if nb_matches > 1:
                # Prefer the variant flagged as having images; fall back to
                # any match when that lookup fails for whatever reason.
                try:
                    bf = bfs.join(Format).filter(
                        Format.images == True).get()  # noqa: E712 (ORM expr)
                except Exception:
                    bf = bfs.get()
            else:
                bf = bfs.get()

            logger.debug("[{}] Requesting URLs for #{}# {}"
                         .format(fmt, book.id, book.title))

            # retrieve list of URLs for format unless we have it in DB
            if bf.downloaded_from and not force:
                urls = [bf.downloaded_from]
            else:
                urld = get_urls(book)
                # No entry for this mime type means no candidate URL.
                candidates = urld.get(FORMAT_MATRIX.get(fmt)) or []
                # Reversed so that pop() hands the candidates back in order.
                urls = list(reversed(candidates))

            allurls = list(urls)  # kept intact for the error report

            while urls:
                url = urls.pop()

                if not resource_exists(url):
                    continue

                # HTML files are *sometime* available as ZIP files
                if url.endswith('.zip'):
                    zpath = "{}.zip".format(fpath)

                    if not download_file(url, zpath):
                        logger.error("ZIP file donwload failed: {}"
                                     .format(zpath))
                        continue

                    # extract zipfile
                    handle_zipped_epub(zippath=zpath, book=book,
                                       download_cache=download_cache)
                else:
                    if not download_file(url, fpath):
                        logger.error("file donwload failed: {}"
                                     .format(fpath))
                        continue

                # store working URL in DB
                bf.downloaded_from = url
                bf.save()
                # We have a working file: do not fetch the remaining
                # candidates over it.
                break

            if not bf.downloaded_from:
                logger.error("NO FILE FOR #{}/{}".format(book.id, fmt))
                pprint(allurls)
def download_all_books(url_mirror, download_cache,
                       languages=[], formats=[], only_books=[],
                       force=False):
    """Download the content files of the selected books into *download_cache*.

    For each book returned by ``get_list_of_filtered_books`` and each
    requested format, the matching ``BookFormat`` row is looked up, a list of
    candidate URLs is obtained (the URL stored in the DB, or the ones from
    ``get_urls``) and the first URL that exists and downloads correctly is
    saved to ``bf.downloaded_from``.

    param: url_mirror: not used by this function
    param: download_cache: directory receiving the files (created if needed)
    param: languages, formats, only_books: filters forwarded to
           ``get_list_of_filtered_books``; an empty *formats* means every key
           of ``FORMAT_MATRIX``. ``'html'`` is always added to the formats.
    param: force: download again even when the file already exists or a
           working URL is already recorded
    returns: None

    The default lists are never mutated: *formats* is copied before use.
    """
    from pprint import pprint  # only used for the diagnostic dumps below

    # File-name patterns identifying the HTML rendition of a book.
    html_patterns = [
        'mnsrb10h.htm', '8ledo10h.htm', 'tycho10f.htm', '8ledo10h.zip',
        'salme10h.htm', '8nszr10h.htm', '{id}-h.html', '{id}.html.gen',
        '{id}-h.htm', '8regr10h.zip', '{id}.html.noimages', '8lgme10h.htm',
        'tycho10h.htm', 'tycho10h.zip', '8lgme10h.zip', '8indn10h.zip',
        '8resp10h.zip', '20004-h.htm', '8indn10h.htm', '8memo10h.zip',
        'fondu10h.zip', '{id}-h.zip', '8mort10h.zip',
    ]

    available_books = get_list_of_filtered_books(
        languages=languages, formats=formats, only_books=only_books)

    # ensure dir exist
    path(download_cache).mkdir_p()

    # apply filters -- on a private copy, so that appending 'html' neither
    # alters the caller's list nor fails on a dict view.
    formats = list(formats) if formats else list(FORMAT_MATRIX.keys())

    # HTML is our base for ZIM so add it if not present
    if 'html' not in formats:
        formats.append('html')

    for book in available_books:
        logger.info(
            "\tDownloading content files for Book #{id}".format(id=book.id))

        for fmt in formats:
            fpath = os.path.join(download_cache, fname_for(book, fmt))

            # check if already downloaded
            if path(fpath).exists() and not force:
                logger.debug("\t\t{fmt} already exists at {path}".format(
                    fmt=fmt, path=fpath))
                continue

            # retrieve corresponding BookFormat
            bfs = BookFormat.filter(book=book)

            if fmt == 'html':
                bfso = bfs
                bfs = bfs.join(Format).filter(Format.pattern << html_patterns)
                if not bfs.count():
                    # Diagnostic dump of the (empty) filtered selection and
                    # of every format known for the book.
                    pprint([(b.format.mime, b.format.images, b.format.pattern)
                            for b in bfs])
                    pprint([(b.format.mime, b.format.images, b.format.pattern)
                            for b in bfso])
                    logger.error("html not found")
                    continue
            else:
                bfs = bfs.filter(BookFormat.format << Format.filter(
                    mime=FORMAT_MATRIX.get(fmt)))

            nb_matches = bfs.count()
            if not nb_matches:
                logger.debug("[{}] not avail. for #{}# {}".format(
                    fmt, book.id, book.title))
                continue

            if nb_matches > 1:
                # Prefer the variant flagged as having images; fall back to
                # any match when that lookup fails for whatever reason.
                try:
                    bf = bfs.join(Format).filter(
                        Format.images == True).get()  # noqa: E712 (ORM expr)
                except Exception:
                    bf = bfs.get()
            else:
                bf = bfs.get()

            logger.debug("[{}] Requesting URLs for #{}# {}".format(
                fmt, book.id, book.title))

            # retrieve list of URLs for format unless we have it in DB
            if bf.downloaded_from and not force:
                urls = [bf.downloaded_from]
            else:
                urld = get_urls(book)
                # No entry for this mime type means no candidate URL.
                candidates = urld.get(FORMAT_MATRIX.get(fmt)) or []
                # Reversed so that pop() hands the candidates back in order.
                urls = list(reversed(candidates))

            allurls = list(urls)  # kept intact for the error report

            while urls:
                url = urls.pop()

                if not resource_exists(url):
                    continue

                # HTML files are *sometime* available as ZIP files
                if url.endswith('.zip'):
                    zpath = "{}.zip".format(fpath)

                    if not download_file(url, zpath):
                        logger.error(
                            "ZIP file donwload failed: {}".format(zpath))
                        continue

                    # extract zipfile
                    handle_zipped_epub(zippath=zpath, book=book,
                                       download_cache=download_cache)
                else:
                    if not download_file(url, fpath):
                        logger.error(
                            "file donwload failed: {}".format(fpath))
                        continue

                # store working URL in DB
                bf.downloaded_from = url
                bf.save()
                # We have a working file: do not fetch the remaining
                # candidates over it.
                break

            if not bf.downloaded_from:
                logger.error("NO FILE FOR #{}/{}".format(book.id, fmt))
                pprint(allurls)
def save_rdf_in_database(parser):
    """Store the book described by *parser* in the database.

    Creates or updates the ``Author`` identified by ``parser.author_id``
    (falling back to the author with ``gut_id='216'`` when there is none),
    creates the ``Book`` row, then one ``Format`` / ``BookFormat`` pair per
    entry of ``parser.file_types``. Formats listed for the book in
    ``BAD_BOOKS_FORMATS`` are skipped.

    param: parser: object exposing ``author_id``, ``last_name``,
           ``first_name``, ``birth_year``, ``death_year``, ``license``,
           ``gid``, ``title``, ``subtitle``, ``language``, ``downloads`` and
           ``file_types`` (a mapping of file name/URL to mime type)
    returns: None
    """
    # Insert author, if it not exists
    if parser.author_id:
        try:
            author_record = Author.get(gut_id=parser.author_id)
        except Author.DoesNotExist:
            author_record = Author.create(
                gut_id=parser.author_id,
                last_name=parser.last_name,
                first_names=parser.first_name,
                birth_year=parser.birth_year,
                death_year=parser.death_year)
        else:
            # Known author: refresh only the fields the parser provides.
            if parser.last_name:
                author_record.last_name = parser.last_name
            if parser.first_name:
                author_record.first_names = parser.first_name
            if parser.birth_year:
                author_record.birth_year = parser.birth_year
            if parser.death_year:
                author_record.death_year = parser.death_year
            author_record.save()
    else:
        # No author, set Anonymous
        author_record = Author.get(gut_id='216')

    # Get license
    try:
        license_record = License.get(name=parser.license)
    except License.DoesNotExist:
        license_record = None

    # Insert book
    book_record = Book.create(
        id=parser.gid,
        title=parser.title.strip(),
        subtitle=parser.subtitle.strip(),
        author=author_record,  # foreign key
        license=license_record,  # foreign key
        language=parser.language.strip(),
        downloads=parser.downloads
    )

    # Mime types that must not be registered for this particular book.
    bid = int(book_record.id)
    excluded_mimes = [FORMAT_MATRIX.get(f)
                      for f in BAD_BOOKS_FORMATS.get(bid, [])]

    # Insert formats
    for file_type in parser.file_types:
        raw_mime = parser.file_types[file_type]

        # Sanitize MIME: the charset is dropped, except for plain text
        mime = raw_mime
        if not mime.startswith('text/plain'):
            mime = re.sub(r'; charset=[a-z0-9-]+', '', mime)

        # Insert format type: the file name with the book id replaced by a
        # placeholder. The id is substituted literally, not as a regex.
        pattern = file_type.replace(parser.gid, '{id}')
        pattern = pattern.split('/')[-1]

        if mime in excluded_mimes:
            logger.error("\t**** EXCLUDING **** {} for book #{} from list."
                         .format(mime, bid))
            continue

        # NOTE(review): the result is used directly as the Format instance;
        # confirm get_or_create() does not return an (instance, created)
        # tuple with the peewee version in use.
        format_record = Format.get_or_create(
            mime=mime,
            images=(file_type.endswith('.images')
                    or raw_mime == 'application/pdf'),
            pattern=pattern)

        # Insert book format
        BookFormat.create(
            book=book_record,  # foreign key
            format=format_record  # foreign key
        )