def cdb_set_fields(ctx, rd, book_id, library_id):
    db = get_db(ctx, rd, library_id)
    if ctx.restriction_for(rd, db):
        raise HTTPForbidden('Cannot use the set fields interface with a user who has per library restrictions')
    data = load_payload_data(rd)
    try:
        changes, loaded_book_ids = data['changes'], frozenset(map(int, data.get('loaded_book_ids', ())))
        all_dirtied = bool(data.get('all_dirtied'))
        if not isinstance(changes, dict):
            raise TypeError('changes must be a dict')
    except Exception:
        raise HTTPBadRequest(
            '''Data must be of the form {'changes': {'title': 'New Title', ...}, 'loaded_book_ids': [book_id1, book_id2, ...]}''')
    dirtied = set()
    cdata = changes.pop('cover', False)
    if cdata is not False:
        if cdata is not None:
            try:
                cdata = from_base64_bytes(cdata.split(',', 1)[-1])
            except Exception:
                raise HTTPBadRequest('Cover data is not valid base64 encoded data')
            try:
                fmt = what(None, cdata)
            except Exception:
                fmt = None
            if fmt not in ('jpeg', 'png'):
                raise HTTPBadRequest('Cover data must be either JPEG or PNG')
        dirtied |= db.set_cover({book_id: cdata})
    added_formats = changes.pop('added_formats', False)
    if added_formats:
        for data in added_formats:
            try:
                fmt = data['ext'].upper()
            except Exception:
                raise HTTPBadRequest('Format has no extension')
            if fmt:
                try:
                    fmt_data = from_base64_bytes(data['data_url'].split(',', 1)[-1])
                except Exception:
                    raise HTTPBadRequest('Format data is not valid base64 encoded data')
                if db.add_format(book_id, fmt, ReadOnlyFileBuffer(fmt_data)):
                    dirtied.add(book_id)
    removed_formats = changes.pop('removed_formats', False)
    if removed_formats:
        db.remove_formats({book_id: list(removed_formats)})
        dirtied.add(book_id)
    for field, value in iteritems(changes):
        dirtied |= db.set_field(field, {book_id: value})
    ctx.notify_changes(db.backend.library_path, metadata(dirtied))
    all_ids = dirtied if all_dirtied else (dirtied & loaded_book_ids)
    all_ids |= {book_id}
    return {bid: book_as_json(db, bid) for bid in all_ids}

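# Illustrative only (not part of the calibre source): a minimal sketch of the
# payload shape cdb_set_fields() accepts, following its HTTPBadRequest message
# and the keys it pops from 'changes'. The field values, book ids and the
# truncated JPEG data URL below are hypothetical.
example_set_fields_payload = {
    'changes': {
        'title': 'New Title',
        # cover is a base64 data URL; everything after the first comma is
        # decoded with from_base64_bytes()
        'cover': 'data:image/jpeg;base64,/9j/4AAQSkZJRg==',
    },
    'loaded_book_ids': [1, 2],
    'all_dirtied': False,
}
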
def __call__(self, oeb, opts):
    self.log = oeb.log
    attr_path = XPath('//h:img[@src]')
    for item in oeb.spine:
        root = item.data
        if not hasattr(root, 'xpath'):
            continue
        for img in attr_path(root):
            raw = img.get('src', '')
            if not raw.startswith('data:'):
                continue
            header, data = raw.partition(',')[0::2]
            if not header.startswith('data:image/') or not data:
                continue
            if ';base64' in header:
                data = re.sub(r'\s+', '', data)
                from polyglot.binary import from_base64_bytes
                try:
                    data = from_base64_bytes(data)
                except Exception:
                    self.log.error('Found invalid base64 encoded data URI, ignoring it')
                    continue
            else:
                data = urlunquote(data)
            from imghdr import what
            fmt = what(None, data)
            if not fmt:
                self.log.warn('Image encoded as data URL has unknown format, ignoring')
                continue
            img.set('src', item.relhref(self.convert_image_data_uri(data, fmt, oeb)))

def decode_thumbnail(tup):
    '''
    Decode an encoded thumbnail into its 3 component parts
    '''
    if tup is None:
        return None
    return (tup[0], tup[1], from_base64_bytes(tup[2]))

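# Illustrative only: decode_thumbnail() expects a 3-tuple whose last element is
# base64 text; the first two elements (assumed here to be width and height) are
# passed through unchanged. The sample bytes below are made up.
import base64

sample = (90, 120, base64.b64encode(b'fake thumbnail bytes'))
width, height, raw = decode_thumbnail(sample)  # raw == b'fake thumbnail bytes'
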
def decode_metadata(x, for_json):
    from polyglot.binary import from_base64_bytes
    from calibre.ebooks.metadata.book.serialize import metadata_from_dict
    obj = metadata_from_dict(x)
    if for_json and obj.cover_data and obj.cover_data[1]:
        obj.cover_data = obj.cover_data[0], from_base64_bytes(obj.cover_data[1])
    return obj

def update_booklist(self, bl, bl_index):
    if bl_index not in self.record_roots:
        return
    debug_print('Updating JSON cache:', bl_index)
    playlist_map = self.build_id_playlist_map(bl_index)
    root = self.record_roots[bl_index]
    lpath_map = self.build_lpath_map(root)
    for book in bl:
        record = lpath_map.get(book.lpath, None)
        if record is not None:
            for thumbnail in record.xpath(
                    'descendant::*[local-name()="thumbnail"]'):
                for img in thumbnail.xpath(
                        'descendant::*[local-name()="jpeg"]|'
                        'descendant::*[local-name()="png"]'):
                    if img.text:
                        try:
                            raw = from_base64_bytes(img.text.strip())
                        except Exception:
                            continue
                        book.thumbnail = raw
                        break
                break
            book.device_collections = playlist_map.get(book.lpath, [])
    debug_print('Finished updating JSON cache:', bl_index)

def get_stored_annotations(container, bookmark_data):
    raw = bookmark_data or b''
    if not raw:
        return
    if raw.startswith(EPUB_FILE_TYPE_MAGIC):
        raw = raw[len(EPUB_FILE_TYPE_MAGIC):].replace(b'\n', b'')
        for annot in parse_annotations(from_base64_bytes(raw)):
            yield annot
        return

    from calibre.ebooks.oeb.iterator.bookmarks import parse_bookmarks
    for bm in parse_bookmarks(raw):
        if bm['type'] == 'cfi' and isinstance(bm['pos'], unicode_type):
            spine_index = (1 + bm['spine']) * 2
            epubcfi = 'epubcfi(/{}/{})'.format(spine_index, bm['pos'].lstrip('/'))
            title = bm.get('title')
            if title and title != 'calibre_current_page_bookmark':
                yield {'type': 'bookmark', 'title': title, 'pos': epubcfi, 'pos_type': 'epubcfi', 'timestamp': EPOCH}
            else:
                yield {'type': 'last-read', 'pos': epubcfi, 'pos_type': 'epubcfi', 'timestamp': EPOCH}

def download_one(tdir, timeout, progress_report, data_uri_map, url):
    try:
        purl = urlparse(url)
        data_url_key = None
        with NamedTemporaryFile(dir=tdir, delete=False) as df:
            if purl.scheme == 'file':
                path = unquote(purl.path)
                if iswindows and path.startswith('/'):
                    path = path[1:]
                src = lopen(path, 'rb')
                filename = os.path.basename(path)
                sz = (src.seek(0, os.SEEK_END), src.tell(), src.seek(0))[1]
            elif purl.scheme == 'data':
                prefix, payload = purl.path.split(',', 1)
                parts = prefix.split(';')
                if parts and parts[-1].lower() == 'base64':
                    payload = re.sub(r'\s+', '', payload)
                    payload = from_base64_bytes(payload)
                else:
                    payload = payload.encode('utf-8')
                # Derive a filename from the data: URI mimetype before the
                # dedupe check, so the early return below has one available
                ext = 'unknown'
                for x in parts:
                    if '=' not in x and '/' in x:
                        exts = mimetypes.guess_all_extensions(x)
                        if exts:
                            ext = exts[0]
                            break
                filename = 'data-uri.' + ext
                seen_before = data_uri_map.get(payload)
                if seen_before is not None:
                    return True, (url, filename, seen_before, guess_type(seen_before))
                data_url_key = payload
                src = BytesIO(payload)
                sz = len(payload)
            else:
                src = urlopen(url, timeout=timeout)
                filename = get_filename(purl, src)
                sz = get_content_length(src)
            progress_report(url, 0, sz)
            dest = ProgressTracker(df, url, sz, progress_report)
            with closing(src):
                shutil.copyfileobj(src, dest)
            if data_url_key is not None:
                data_uri_map[data_url_key] = dest.name
            filename = sanitize_file_name(filename)
            mt = guess_type(filename)
            if mt in OEB_DOCS:
                raise ValueError('The external resource {} looks like a HTML document ({})'.format(url, filename))
            if not mt or mt == 'application/octet-stream' or '.' not in filename:
                raise ValueError('The external resource {} is not of a known type'.format(url))
            return True, (url, filename, dest.name, mt)
    except Exception as err:
        return False, (url, as_unicode(err))

def main(args):
    # Ensure we can continue to function if GUI is closed
    os.environ.pop('CALIBRE_WORKER_TEMP_DIR', None)
    reset_base_dir()
    if iswindows:
        # Ensure that all instances are grouped together in the task bar. This
        # prevents them from being grouped with viewer/editor process when
        # launched from within calibre, as both use calibre-parallel.exe
        set_app_uid(STORE_DIALOG_APP_UID)

    data = args[-1]
    data = json.loads(from_base64_bytes(data))
    override = 'calibre-gui' if islinux else None
    app = Application(args, override_program_name=override)
    m = Main(data)
    m.show(), m.raise_()
    app.exec()
    del m
    del app

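# Illustrative only: main() expects its last argument to be JSON that has been
# base64 encoded. A hedged sketch, using the standard library, of building such
# an argument; the 'window_title' key and the argv layout are hypothetical and
# the real caller inside calibre may construct the payload differently.
import base64
import json

payload = base64.standard_b64encode(json.dumps({'window_title': 'Get books'}).encode('utf-8'))
# main(['store-dialog', payload]) would then recover the dict via
# json.loads(from_base64_bytes(payload))
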
def base64_decode(raw):
    from io import BytesIO
    from polyglot.binary import from_base64_bytes

    # First try the python implementation as it is faster
    try:
        return from_base64_bytes(raw)
    except Exception:
        pass

    # Try a more robust version (adapted from FBReader sources)
    A, Z, a, z, zero, nine, plus, slash, equal = bytearray(b'AZaz09+/=')
    raw = bytearray(raw)
    out = BytesIO()
    pos = 0
    while pos < len(raw):
        tot = 0
        i = 0
        while i < 4 and pos < len(raw):
            byt = raw[pos]
            pos += 1
            num = 0
            if A <= byt <= Z:
                num = byt - A
            elif a <= byt <= z:
                num = byt - a + 26
            elif zero <= byt <= nine:
                num = byt - zero + 52
            else:
                num = {plus: 62, slash: 63, equal: 64}.get(byt, None)
                if num is None:
                    # Ignore this byte
                    continue
            tot += num << (6 * (3 - i))
            i += 1
        triple = bytearray(3)
        for j in (2, 1, 0):
            triple[j] = tot & 0xff
            tot >>= 8
        out.write(bytes(triple))
    return out.getvalue()

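# Illustrative only: base64_decode() first tries the strict decoder and, if
# that raises, falls back to the lenient FBReader-style loop above, which skips
# bytes outside the base64 alphabet. Sample values are made up.
assert base64_decode(b'aGVsbG8=') == b'hello'
# Input with broken padding is rejected by the strict decoder but still yields
# output from the fallback (note the trailing NUL from the final 3-byte group):
assert base64_decode(b'aGVsbG8') == b'hello\x00'
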
def decode_data_url(url):
    return from_base64_bytes(url.partition(',')[2])

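# Illustrative only: decode_data_url() base64-decodes everything after the
# first comma of a data: URL; it assumes the payload is base64 (the ';base64'
# marker in the header is not checked). The sample URI is made up.
png_bytes = decode_data_url('data:image/png;base64,iVBORw0KGgo=')
# png_bytes == b'\x89PNG\r\n\x1a\n' (the PNG signature)
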
def process_images(self, soup, baseurl):
    diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
    if not os.path.exists(diskpath):
        os.mkdir(diskpath)
    c = 0
    for tag in soup.findAll('img', src=True):
        iurl = tag['src']
        if iurl.startswith('data:image/'):
            try:
                data = from_base64_bytes(iurl.partition(',')[-1])
            except Exception:
                self.log.exception('Failed to decode embedded image')
                continue
        else:
            if callable(self.image_url_processor):
                iurl = self.image_url_processor(baseurl, iurl)
            if not urlsplit(iurl).scheme:
                iurl = urljoin(baseurl, iurl, False)
            with self.imagemap_lock:
                if iurl in self.imagemap:
                    tag['src'] = self.imagemap[iurl]
                    continue
            try:
                data = self.fetch_url(iurl)
                if data == b'GIF89a\x01':
                    # Skip empty GIF files as PIL errors on them anyway
                    continue
            except Exception:
                self.log.exception('Could not fetch image ', iurl)
                continue
        c += 1
        fname = ascii_filename('img' + unicode_type(c))
        data = self.preprocess_image_ext(data, iurl) if self.preprocess_image_ext is not None else data
        if data is None:
            continue
        itype = what(None, data)
        if itype == 'svg' or (itype is None and b'<svg' in data[:1024]):
            # SVG image
            imgpath = os.path.join(diskpath, fname + '.svg')
            with self.imagemap_lock:
                self.imagemap[iurl] = imgpath
            with open(imgpath, 'wb') as x:
                x.write(data)
            tag['src'] = imgpath
        else:
            try:
                # Ensure image is valid
                img = image_from_data(data)
                if itype not in {'png', 'jpg', 'jpeg'}:
                    itype = 'png' if itype == 'gif' else 'jpeg'
                    data = image_to_data(img, fmt=itype)
                if self.compress_news_images and itype in {'jpg', 'jpeg'}:
                    try:
                        data = self.rescale_image(data)
                    except Exception:
                        self.log.exception('failed to compress image ' + iurl)
                # Moon+ apparently cannot handle .jpeg files
                if itype == 'jpeg':
                    itype = 'jpg'
                imgpath = os.path.join(diskpath, fname + '.' + itype)
                with self.imagemap_lock:
                    self.imagemap[iurl] = imgpath
                with open(imgpath, 'wb') as x:
                    x.write(data)
                tag['src'] = imgpath
            except Exception:
                traceback.print_exc()
                continue

def read_metadata_kfx(stream, read_cover=True):
    ' Read the metadata.kfx file that is found in the sdr book folder for KFX files '
    c = Container(stream.read())
    m = extract_metadata(c.decode())
    # dump_metadata(m)

    def has(x):
        return m[x] and m[x][0]

    def get(x, single=True):
        ans = m[x]
        if single:
            ans = clean_xml_chars(ans[0]) if ans else ''
        else:
            ans = [clean_xml_chars(y) for y in ans]
        return ans

    title = get('title') or _('Unknown')
    authors = get('author', False) or [_('Unknown')]
    auth_pat = re.compile(r'([^,]+?)\s*,\s+([^,]+)$')

    def fix_author(x):
        if tweaks['author_sort_copy_method'] != 'copy':
            m = auth_pat.match(x.strip())
            if m is not None:
                return m.group(2) + ' ' + m.group(1)
        return x

    unique_authors = []  # remove duplicates while retaining order
    for f in [fix_author(x) for x in authors]:
        if f not in unique_authors:
            unique_authors.append(f)

    mi = Metadata(title, unique_authors)
    if has('author'):
        mi.author_sort = get('author')
    if has('ASIN'):
        mi.set_identifier('mobi-asin', get('ASIN'))
    elif has('content_id'):
        mi.set_identifier('mobi-asin', get('content_id'))
    if has('languages'):
        langs = list(filter(None, (canonicalize_lang(x) for x in get('languages', False))))
        if langs:
            mi.languages = langs
    if has('issue_date'):
        try:
            mi.pubdate = parse_only_date(get('issue_date'))
        except Exception:
            pass
    if has('publisher') and get('publisher') != 'Unknown':
        mi.publisher = get('publisher')
    if read_cover and m[COVER_KEY]:
        try:
            data = from_base64_bytes(m[COVER_KEY])
            fmt, w, h = identify(data)
        except Exception:
            w, h, fmt = 0, 0, None
        if fmt and w > -1 and h > -1:
            mi.cover_data = (fmt, data)

    return mi