def download_cover(book, book_dir, s3_storage, optimizer_version):
    has_cover = Book.select(Book.cover_page).where(Book.id == book.id)
    if has_cover:
        # try to download optimized cover from cache if s3_storage
        url = "{}{}/pg{}.cover.medium.jpg".format(IMAGE_BASE, book.id, book.id)
        etag = get_etag_from_url(url)
        downloaded_from_cache = False
        cover = "{}_cover_image.jpg".format(book.id)
        if (
            book_dir.joinpath("optimized").joinpath(cover).exists()
            or book_dir.joinpath("unoptimized").joinpath(cover).exists()
        ):
            logger.debug(f"Cover already exists for book #{book.id}")
            return
        if s3_storage:
            logger.info(
                f"Trying to download cover for {book.id} from optimization cache"
            )
            downloaded_from_cache = download_from_cache(
                book=book,
                etag=etag,
                book_format="cover",
                dest_dir=book_dir.joinpath("optimized"),
                s3_storage=s3_storage,
                optimizer_version=optimizer_version,
            )
        if not downloaded_from_cache:
            logger.debug("Downloading {}".format(url))
            if download_file(url, book_dir.joinpath("unoptimized").joinpath(cover)):
                book.cover_etag = etag
                book.save()
    else:
        logger.debug("No Book Cover found for Book #{}".format(book.id))
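# A minimal usage sketch for the function above (hypothetical path; assumes the
# peewee database is already populated and that book_dir has "optimized" and
# "unoptimized" subfolders). With s3_storage=None the optimization cache is
# skipped and the cover is fetched directly from IMAGE_BASE:
#
#   from pathlib import Path
#   book = Book.get(id=9)
#   download_cover(book, Path("covers/9"), s3_storage=None, optimizer_version=None)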
def get_list_of_filtered_books(languages, formats, only_books=[]):
    if len(formats):
        qs = (
            Book.select()
            .join(BookFormat)
            .join(Format)
            .where(Format.mime << [FORMAT_MATRIX.get(f) for f in formats])
            .group_by(Book.id)
        )
    else:
        qs = Book.select()

    if len(only_books):
        qs = qs.where(Book.id << only_books)

    if len(languages):
        qs = qs.where(Book.language << languages)

    return qs
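# For example, a hypothetical call selecting the French and Italian books that
# ship an EPUB (assuming "epub" is a key of FORMAT_MATRIX):
#
#   books = get_list_of_filtered_books(languages=["fr", "it"], formats=["epub"])
#   for book in books:
#       print(book.id, book.title)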
def download_covers(book, download_cache):
    cover = "{}_cover.jpg".format(book.id)
    fpath = os.path.join(download_cache, cover)
    has_cover = Book.select(Book.cover_page).where(Book.id == book.id)
    if has_cover:
        url = "{}{}/pg{}.cover.medium.jpg".format(IMAGE_BASE, book.id, book.id)
        logger.debug("Downloading {}".format(url))
        download_file(url, fpath)
    else:
        logger.debug("No Book Cover found for Book #{}".format(book.id))
    return True
def parse_and_process_file(rdf_file, force=False):
    if not path(rdf_file).exists():
        raise ValueError(rdf_file)

    gid = re.match(r".*/pg([0-9]+).rdf", rdf_file).groups()[0]

    if Book.get_or_none(id=int(gid)):
        logger.info("\tSkipping already parsed file {}".format(rdf_file))
        return

    logger.info("\tParsing file {}".format(rdf_file))
    with open(rdf_file, "r") as f:
        parser = RdfParser(f.read(), gid).parse()

    if parser.license == "None":
        logger.info("\tWARN: Unusable book without any information {}".format(gid))
    elif parser.title == "":
        logger.info("\tWARN: Unusable book without title {}".format(gid))
    else:
        save_rdf_in_database(parser)
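# A hypothetical invocation (the file name must match the pg<id>.rdf pattern,
# since the book id is extracted from the path):
#
#   parse_and_process_file("rdf-files/cache/epub/9/pg9.rdf")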
etext_names = ["{0:0=2d}".format(i) for i in etext_nums] etext_urls = [] for i in etext_names: etext_urls.append(os.path.join(u.build() + i, file_name)) urls.extend([url_zip, url_htm, url_html, html_utf8]) urls.extend(etext_urls) return list(set(urls)) def setup_urls(): file_with_url = os.path.join("tmp", "file_on_{}".format(UrlBuilder.SERVER_NAME)) cmd = [ "bash", "-c", "rsync -a --list-only {} > {}".format(UrlBuilder.RSYNC, file_with_url) ] exec_cmd(cmd) in_place_opt = ["-i", ".bak"] if platform.system() == "Darwin" else ["-i"] cmd = ["sed"] + in_place_opt + [r"s#.* \(.*\)$#\\1#", file_with_url] exec_cmd(cmd) field_names = ['url'] load_csv(Url, file_with_url, field_names=field_names) if __name__ == '__main__': book = Book.get(id=9) print(get_urls(book))
def get_list_of_all_languages():
    return list(set(b.language for b in Book.select(Book.language)))
def save_rdf_in_database(parser):
    # Insert author if it does not exist yet
    if parser.author_id:
        try:
            author_record = Author.get(gut_id=parser.author_id)
        except Exception:
            try:
                author_record = Author.create(
                    gut_id=parser.author_id,
                    last_name=normalize(parser.last_name),
                    first_names=normalize(parser.first_name),
                    birth_year=parser.birth_year,
                    death_year=parser.death_year,
                )
            # concurrent workers might collide here, so we retry once on IntegrityError
            except peewee.IntegrityError:
                author_record = Author.get(gut_id=parser.author_id)
        else:
            if parser.last_name:
                author_record.last_name = normalize(parser.last_name)
            if parser.first_name:
                author_record.first_names = normalize(parser.first_name)
            if parser.birth_year:
                author_record.birth_year = parser.birth_year
            if parser.death_year:
                author_record.death_year = parser.death_year
            author_record.save()
    else:
        # No author: use the Anonymous author record
        author_record = Author.get(gut_id="216")

    # Get license
    try:
        license_record = License.get(name=parser.license)
    except Exception:
        license_record = None

    # Insert book
    try:
        book_record = Book.get(id=parser.gid)
    except Book.DoesNotExist:
        book_record = Book.create(
            id=parser.gid,
            title=normalize(parser.title.strip()),
            subtitle=normalize(parser.subtitle.strip()),
            author=author_record,  # foreign key
            license=license_record,  # foreign key
            language=parser.language.strip(),
            downloads=parser.downloads,
            bookshelf=parser.bookshelf,
            cover_page=parser.cover_image,
        )
    else:
        book_record.title = normalize(parser.title.strip())
        book_record.subtitle = normalize(parser.subtitle.strip())
        book_record.author = author_record  # foreign key
        book_record.license = license_record  # foreign key
        book_record.language = parser.language.strip()
        book_record.downloads = parser.downloads
        book_record.save()

    # Insert a PDF format if parser.file_types has none;
    # the presence of PDFs on the server and in the RDF is inconsistent
    if not [
        key
        for key in parser.file_types
        if parser.file_types[key].startswith("application/pdf")
    ]:
        parser.file_types.update({"{id}-pdf.pdf": "application/pdf"})

    # Insert formats
    for file_type in parser.file_types:

        # Sanitize MIME: strip the charset suffix from non-plain-text types
        mime = parser.file_types[file_type]
        if not mime.startswith("text/plain"):
            mime = re.sub(r"; charset=[a-z0-9-]+", "", mime)

        # Turn the file name into a reusable pattern,
        # e.g. "9.epub.images" becomes "{id}.epub.images"
        pattern = re.sub(r"" + parser.gid, "{id}", file_type)
        pattern = pattern.split("/")[-1]

        bid = int(book_record.id)
        if bid in BAD_BOOKS_FORMATS.keys() and mime in [
            FORMAT_MATRIX.get(f) for f in BAD_BOOKS_FORMATS.get(bid)
        ]:
            logger.error(
                "\t**** EXCLUDING **** {} for book #{} from list.".format(mime, bid)
            )
            continue

        format_record, _ = Format.get_or_create(
            mime=mime,
            images=file_type.endswith(".images")
            or parser.file_types[file_type] == "application/pdf",
            pattern=pattern,
        )

        # Insert book format
        BookFormat.get_or_create(
            book=book_record,  # foreign key
            format=format_record,  # foreign key
        )
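# Illustrative shape of the exclusion table consulted above (hypothetical
# values; the real mapping lives in the project constants): book id mapped to
# the FORMAT_MATRIX keys whose files are known to be broken for that book,
# e.g.
#
#   BAD_BOOKS_FORMATS = {
#       1234: ["pdf"],
#       5678: ["html", "epub"],
#   }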