def extract_resources(self):
    """Build ``self.resource_map`` from ``self.resource_records``.

    Every entry is a ``(name, payload)`` tuple whose name has the form
    ``<prefix>/<index><suffix>.<ext>``. HUFF, CDIC and INDX records are
    skipped. FONT records are decoded with ``read_font_record``; records
    with an unrecognised signature are probed with ``what`` and filed under
    ``images/`` when a format is detected. Anything else lands under
    ``binary/`` with a ``.dat`` extension.

    Raises:
        ValueError: if ``read_font_record`` reports an error for a FONT record.
    """
    eof_signature = b'\xe9\x8e\r\n'
    skipped_signatures = {b'HUFF', b'CDIC', b'INDX'}
    known_types = {b'FLIS', b'FCIS', b'SRCS', eof_signature, b'RESC',
                   b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}
    self.resource_map = []

    for index, record in enumerate(self.resource_records):
        signature = record.raw[:4]
        if signature in skipped_signatures:
            continue
        # TODO: Ignore CNCX records as well

        prefix, ext, suffix = 'binary', 'dat', ''
        payload = record.raw

        if signature == b'FONT':
            font = read_font_record(record.raw)
            if font['err']:
                raise ValueError('Failed to read font record: %s Headers: %s'%(
                    font['err'], font['headers']))
            payload = font['font_data'] or font['raw_data']
            prefix, ext = 'fonts', font['ext']
        elif signature not in known_types:
            detected = what(None, record.raw)
            if detected:
                prefix, ext = 'images', detected

        # Only records that stayed "binary" get a signature-derived suffix.
        if prefix == 'binary':
            if signature == eof_signature:
                suffix = '-EOF'
            elif signature in known_types:
                suffix = '-' + signature.decode('ascii')

        name = '%s/%06d%s.%s'%(prefix, index, suffix, ext)
        self.resource_map.append((name, payload))
def resource_adder(self, link_, base=None):
    """Add the local file referenced by ``link_`` to the OEB manifest.

    Returns the manifest href (plus ``#fragment`` when present) for the
    linked file. ``link_`` is returned unchanged when it cannot be resolved
    to a readable regular file, and ``None`` is returned for links to
    plain-text files.
    """
    from polyglot.urllib import quote

    link, frag = self.link_to_local_path(link_, base=base)
    if link is None:
        return link_
    try:
        if base and not os.path.isabs(link):
            link = os.path.join(base, link)
        link = os.path.abspath(link)
    except:
        return link_
    if not os.access(link, os.R_OK):
        return link_
    if os.path.isdir(link):
        self.log.warn(link_, 'is a link to a directory. Ignoring.')
        return link_
    if not self.is_case_sensitive(tempfile.gettempdir()):
        link = link.lower()

    if link not in self.added_resources:
        parent_dir = os.path.dirname(link)
        bhref = os.path.basename(link)
        item_id, href = self.oeb.manifest.generate(
            id='added', href=sanitize_file_name(bhref))
        guessed = self.guess_type(href)[0]
        media_type = guessed or self.BINARY_MIME
        if media_type == 'text/plain':
            self.log.warn('Ignoring link to text file %r' % link_)
            return None
        if media_type == self.BINARY_MIME:
            # Check for the common case, images
            try:
                img = what(link)
            except EnvironmentError:
                img = None
            if img:
                media_type = self.guess_type('dummy.' + img)[0] or self.BINARY_MIME

        self.oeb.log.debug('Added', link)
        self.oeb.container = self.DirContainer(parent_dir, self.oeb.log,
                                               ignore_opf=True)
        # Load into memory
        item = self.oeb.manifest.add(item_id, href, media_type)
        # bhref refers to an already existing file. The read() method of
        # DirContainer will call unquote on it before trying to read the
        # file, therefore we quote it here.
        if isinstance(bhref, unicode_type):
            bhref = bhref.encode('utf-8')
        item.html_input_href = as_unicode(quote(bhref))
        if guessed in self.OEB_STYLES:
            item.override_css_fetch = partial(self.css_import_handler, parent_dir)
        item.data
        self.added_resources[link] = href

    nlink = self.added_resources[link]
    return '#'.join((nlink, frag)) if frag else nlink
def read_image_data(self, fname, base=None):
    """Return ``(raw_bytes, file_name)`` for the image named ``fname``.

    ``file://`` names are read from the local filesystem, everything else
    from ``self.docx``. EMF data is passed through ``emf_unwrap`` and, on
    success, treated as PNG.

    Raises:
        LinkedImageNotFound: if the linked file does not exist or
            ``self.docx.read`` raises ``KeyError``.
    """
    scheme = 'file://'
    if fname.startswith(scheme):
        src = fname[len(scheme):]
        if iswindows and src and src[0] == '/':
            src = src[1:]
        if not src or not os.path.exists(src):
            raise LinkedImageNotFound(src)
        with open(src, 'rb') as rawsrc:
            raw = rawsrc.read()
    else:
        try:
            raw = self.docx.read(fname)
        except KeyError:
            raise LinkedImageNotFound(fname)

    base = base or image_filename(fname.rpartition('/')[-1]) or 'image'
    ext = what(None, raw) or base.rpartition('.')[-1] or 'jpeg'
    if ext == 'emf':
        # For an example, see: https://bugs.launchpad.net/bugs/1224849
        self.log('Found an EMF image: %s, trying to extract embedded raster image' % fname)
        from calibre.utils.wmf.emf import emf_unwrap
        try:
            raw = emf_unwrap(raw)
        except Exception:
            self.log.exception('Failed to extract embedded raster image from EMF')
        else:
            ext = 'png'

    stem = base.rpartition('.')[0] or 'image'
    return raw, stem + '.' + ext
def generate_filename(self, rid, base=None, rid_map=None):
    """Write the image for relationship ``rid`` to ``self.dest_dir``.

    Returns the file name used. A name already recorded in ``self.used``
    for the same source is returned without re-reading the image. Name
    clashes are resolved by appending ``-<n>`` before the extension.
    """
    if rid_map is None:
        rid_map = self.rid_map
    fname = rid_map[rid]
    if fname in self.used:
        return self.used[fname]

    raw = self.docx.read(fname)
    base = base or ascii_filename(rid_map[rid].rpartition('/')[-1]).replace(' ', '_') or 'image'
    ext = what(None, raw) or base.rpartition('.')[-1] or 'jpeg'
    if ext == 'emf':
        # For an example, see: https://bugs.launchpad.net/bugs/1224849
        self.log('Found an EMF image: %s, trying to extract embedded raster image' % base)
        from calibre.utils.wmf.emf import emf_unwrap
        try:
            raw = emf_unwrap(raw)
        except Exception:
            self.log.exception('Failed to extract embedded raster image from EMF')
        else:
            ext = 'png'
    base = (base.rpartition('.')[0] or 'image') + '.' + ext

    # Pick the first candidate name not already handed out.
    taken = frozenset(self.used.itervalues())
    stem, _sep, tail = base.rpartition('.')
    name, counter = base, 1
    while name in taken:
        name = '%s-%d.%s' % (stem, counter, tail)
        counter += 1

    self.used[fname] = name
    with open(os.path.join(self.dest_dir, name), 'wb') as f:
        f.write(raw)
    self.all_images.add('images/' + name)
    return name
def extract_images(self, processed_records, output_dir):
    """Save image sections as ``NNNNN.jpg`` files under ``output_dir/images``.

    Section indices that are visited are appended to ``processed_records``;
    the names of successfully saved images are collected in
    ``self.image_names``. Sections that are not recognised images, or that
    fail to save, are skipped silently.
    """
    non_image_signatures = {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'RESC',
                            b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}
    image_formats = {'jpg', 'jpeg', 'gif', 'png', 'bmp'}

    self.log.debug('Extracting images...')
    output_dir = os.path.abspath(os.path.join(output_dir, 'images'))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    self.image_names = []

    start = getattr(self.book_header, 'first_image_index', -1)
    if start < 0 or start > self.num_sections:
        # BAEN PRC files have bad headers
        start = 0

    image_index = 0
    for section_num in range(start, self.num_sections):
        if section_num in processed_records:
            continue
        processed_records.append(section_num)
        data = self.sections[section_num][0]
        image_index += 1
        if data[:4] in non_image_signatures:
            # This record is a known non image type, no need to try to
            # load the image
            continue

        path = os.path.join(output_dir, '%05d.jpg' % image_index)
        try:
            if what(None, data) not in image_formats:
                continue
            save_cover_data_to(data, path, minify_to=(10000, 10000))
        except Exception:
            continue
        self.image_names.append(os.path.basename(path))
def return_raster_image(path):
    """Return the bytes of ``path`` if ``what`` detects a non-SVG image.

    Returns ``None`` when the file is unreadable, or when ``what`` reports
    either no format or ``'svg'``.
    """
    from calibre.utils.imghdr import what

    if not os.access(path, os.R_OK):
        return None
    with open(path, 'rb') as f:
        raw = f.read()
    if what(None, raw) in (None, 'svg'):
        return None
    return raw
def cdb_set_fields(ctx, rd, book_id, library_id):
    """Apply the field changes in the request payload to ``book_id``.

    The payload must contain a ``changes`` dict and may contain
    ``loaded_book_ids`` and ``all_dirtied``. A ``cover`` entry in
    ``changes`` is either ``None`` or base64 data (optionally preceded by a
    ``...,`` prefix) that must decode to JPEG or PNG.

    Returns a dict mapping each reported book id to ``book_as_json`` output.

    Raises:
        HTTPForbidden: if the user has per-library restrictions.
        HTTPBadRequest: if the payload or cover data is malformed.
    """
    db = get_db(ctx, rd, library_id)
    if ctx.restriction_for(rd, db):
        raise HTTPForbidden('Cannot use the set fields interface with a user who has per library restrictions')

    data = load_payload_data(rd)
    try:
        changes = data['changes']
        loaded_book_ids = frozenset(map(int, data.get('loaded_book_ids', ())))
        all_dirtied = bool(data.get('all_dirtied'))
        if not isinstance(changes, dict):
            raise TypeError('changes must be a dict')
    except Exception:
        raise HTTPBadRequest(
            '''Data must be of the form {'changes': {'title': 'New Title', ...}, 'loaded_book_ids':[book_id1, book_id2, ...]'}''')

    dirtied = set()
    # False is the "no cover key present" sentinel; None is a valid value.
    cover = changes.pop('cover', False)
    if cover is not False:
        if cover is not None:
            try:
                cover = from_base64_bytes(cover.split(',', 1)[-1])
            except Exception:
                raise HTTPBadRequest('Cover data is not valid base64 encoded data')
            try:
                fmt = what(None, cover)
            except Exception:
                fmt = None
            if fmt not in ('jpeg', 'png'):
                raise HTTPBadRequest('Cover data must be either JPEG or PNG')
        dirtied |= db.set_cover({book_id: cover})

    for field, value in iteritems(changes):
        dirtied |= db.set_field(field, {book_id: value})
    ctx.notify_changes(db.backend.library_path, metadata(dirtied))

    if all_dirtied:
        all_ids = dirtied
    else:
        all_ids = dirtied & loaded_book_ids
    all_ids.add(book_id)
    return {bid: book_as_json(db, bid) for bid in all_ids}
def extract_images(self, picts):
    """Decode the hex-encoded ``\\pict`` groups in the file ``picts``.

    Each decoded image is written to the current directory as
    ``NNNN.<fmt>`` (``wmf`` when ``what`` detects nothing), and the
    resulting ``{index: name}`` map is passed to ``self.convert_images``,
    whose result is returned.
    """
    from calibre.utils.imghdr import what
    from binascii import unhexlify

    self.log('Extracting images...')
    with open(picts, 'rb') as f:
        raw = f.read()

    non_hex = re.compile(br'[^a-fA-F0-9]')
    encoded = [non_hex.sub(b'', group)
               for group in re.findall(br'\{\\pict([^}]+)\}', raw)
               if len(group)]

    imap = {}
    for count, enc in enumerate(encoded, 1):
        if len(enc) % 2 == 1:
            # Drop a trailing nibble so the hex string has even length.
            enc = enc[:-1]
        data = unhexlify(enc)
        fmt = what(None, data)
        if fmt is None:
            fmt = 'wmf'
        name = '%04d.%s' % (count, fmt)
        with open(name, 'wb') as f:
            f.write(data)
        imap[count] = name
    return self.convert_images(imap)
def get_metadata(stream):
    """Read metadata (and, when possible, a cover) from a MOBI ``stream``.

    Streams starting with ``TPZ`` are delegated to the Topaz reader. When
    the header has no EXTH block and the stream is smaller than 4 MiB, the
    content is extracted to a temporary directory to look for embedded
    metadata. Returns a metadata object.
    """
    from calibre.ebooks.metadata import MetaInformation
    from calibre.ptempfile import TemporaryDirectory
    from calibre.ebooks.mobi.reader.headers import MetadataHeader
    from calibre.ebooks.mobi.reader.mobi6 import MobiReader
    from calibre.utils.img import save_cover_data_to
    from calibre import CurrentDir

    stream.seek(0)
    try:
        magic = stream.read(3)
    except Exception:
        magic = b''
    stream.seek(0)
    if magic == b'TPZ':
        from calibre.ebooks.metadata.topaz import get_metadata
        return get_metadata(stream)

    from calibre.utils.logging import Log
    log = Log()
    try:
        mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')])
    except:
        mi = MetaInformation(_('Unknown'), [_('Unknown')])

    header = MetadataHeader(stream, log)
    if header.title and header.title != _('Unknown'):
        mi.title = header.title

    if header.exth is None:
        # No EXTH block: fall back to parsing the book itself, but only
        # for small files.
        size = 1024**3
        if hasattr(stream, 'seek') and hasattr(stream, 'tell'):
            pos = stream.tell()
            stream.seek(0, 2)
            size = stream.tell()
            stream.seek(pos)
        if size < 4 * 1024 * 1024:
            with TemporaryDirectory('_mobi_meta_reader') as tdir:
                with CurrentDir(tdir):
                    reader = MobiReader(stream, log)
                    parse_cache = {}
                    reader.extract_content(tdir, parse_cache)
                    if reader.embedded_mi is not None:
                        mi = reader.embedded_mi
    elif header.exth.mi is not None:
        mi = header.exth.mi

    if hasattr(header.exth, 'cover_offset'):
        cover_index = header.first_image_index + header.exth.cover_offset
        data = header.section_data(int(cover_index))
    else:
        try:
            data = header.section_data(header.first_image_index)
        except Exception:
            data = b''

    if data and what(None, data) in {'jpg', 'jpeg', 'gif', 'png', 'bmp', 'webp'}:
        try:
            mi.cover_data = ('jpg', save_cover_data_to(data))
        except Exception:
            log.exception('Failed to read MOBI cover')
    return mi
def cdb_set_fields(ctx, rd, book_id, library_id):
    """Apply the field changes in the request payload to ``book_id``.

    The payload must contain a ``changes`` dict and may contain
    ``loaded_book_ids`` and ``all_dirtied``. A ``cover`` entry in
    ``changes`` is either ``None`` or ASCII base64 text (optionally
    preceded by a ``...,`` prefix) that must decode to JPEG or PNG.

    Returns a dict mapping each reported book id to ``book_as_json`` output.

    Raises:
        HTTPForbidden: if the user has per-library restrictions.
        HTTPBadRequest: if the payload or cover data is malformed.
    """
    db = get_db(ctx, rd, library_id)
    if ctx.restriction_for(rd, db):
        raise HTTPForbidden('Cannot use the set fields interface with a user who has per library restrictions')

    data = load_payload_data(rd)
    try:
        changes = data['changes']
        loaded_book_ids = frozenset(map(int, data.get('loaded_book_ids', ())))
        all_dirtied = bool(data.get('all_dirtied'))
        if not isinstance(changes, dict):
            raise TypeError('changes must be a dict')
    except Exception:
        raise HTTPBadRequest(
            '''Data must be of the form {'changes': {'title': 'New Title', ...}, 'loaded_book_ids':[book_id1, book_id2, ...]'}''')

    dirtied = set()
    # False is the "no cover key present" sentinel; None is a valid value.
    cover = changes.pop('cover', False)
    if cover is not False:
        if cover is not None:
            try:
                encoded = cover.split(',', 1)[-1].encode('ascii')
                cover = standard_b64decode(encoded)
            except Exception:
                raise HTTPBadRequest('Cover data is not valid base64 encoded data')
            try:
                fmt = what(None, cover)
            except Exception:
                fmt = None
            if fmt not in ('jpeg', 'png'):
                raise HTTPBadRequest('Cover data must be either JPEG or PNG')
        dirtied |= db.set_cover({book_id: cover})

    for field, value in iteritems(changes):
        dirtied |= db.set_field(field, {book_id: value})
    ctx.notify_changes(db.backend.library_path, metadata(dirtied))

    if all_dirtied:
        all_ids = dirtied
    else:
        all_ids = dirtied & loaded_book_ids
    all_ids.add(book_id)
    return {bid: book_as_json(db, bid) for bid in all_ids}
def read_image_data(self, fname, base=None):
    """Return ``(raw_bytes, file_name)`` for the image named ``fname``.

    ``file://`` names are read from the local filesystem, everything else
    from ``self.docx``. EMF data is passed through ``emf_unwrap`` and, on
    success, treated as PNG.

    Raises:
        LinkedImageNotFound: if a ``file://`` target does not exist.
    """
    scheme = "file://"
    if fname.startswith(scheme):
        src = fname[len(scheme):]
        if iswindows and src and src[0] == "/":
            src = src[1:]
        if not src or not os.path.exists(src):
            raise LinkedImageNotFound(src)
        with open(src, "rb") as rawsrc:
            raw = rawsrc.read()
    else:
        raw = self.docx.read(fname)

    base = base or ascii_filename(fname.rpartition("/")[-1]).replace(" ", "_") or "image"
    ext = what(None, raw) or base.rpartition(".")[-1] or "jpeg"
    if ext == "emf":
        # For an example, see: https://bugs.launchpad.net/bugs/1224849
        self.log("Found an EMF image: %s, trying to extract embedded raster image" % fname)
        from calibre.utils.wmf.emf import emf_unwrap
        try:
            raw = emf_unwrap(raw)
        except Exception:
            self.log.exception("Failed to extract embedded raster image from EMF")
        else:
            ext = "png"

    stem = base.rpartition(".")[0] or "image"
    return raw, stem + "." + ext
def extract_images(self, picts):
    """Decode the hex-encoded ``\\pict`` groups in the file ``picts``.

    Each decoded image is written to the current directory as
    ``NNNN.<fmt>`` (``wmf`` when ``what`` detects nothing), and the
    resulting ``{index: name}`` map is passed to ``self.convert_images``,
    whose result is returned. Uses the Python 2 ``'hex'`` string codec.
    """
    from calibre.utils.imghdr import what

    self.log('Extracting images...')
    with open(picts, 'rb') as f:
        raw = f.read()

    non_hex = re.compile(r'[^a-fA-F0-9]')
    encoded = [non_hex.sub('', group)
               for group in re.findall(r'\{\\pict([^}]+)\}', raw)
               if len(group)]

    imap = {}
    for count, enc in enumerate(encoded, 1):
        if len(enc) % 2 == 1:
            # Drop a trailing nibble so the hex string has even length.
            enc = enc[:-1]
        data = enc.decode('hex')
        fmt = what(None, data)
        if fmt is None:
            fmt = 'wmf'
        name = u'%04d.%s' % (count, fmt)
        with open(name, 'wb') as f:
            f.write(data)
        imap[count] = name
    return self.convert_images(imap)
def __init__(self, mf):
    """Copy the parsed pieces of ``mf`` and classify its records.

    Builds the primary/secondary index objects when the header references
    them, wraps the text records, and sorts resource records into
    ``image_records``, ``font_records`` and ``binary_records``.
    """
    copied_attrs = ('raw', 'palmdb', 'record_headers', 'records',
                    'mobi_header', 'huffman_record_nums',)
    for attr in copied_attrs:
        setattr(self, attr, getattr(mf, attr))

    # Primary index: header record, then index records, then CNCX blocks.
    self.index_header = self.index_record = None
    self.indexing_record_nums = set()
    pir = getattr(self.mobi_header, 'primary_index_record', NULL_INDEX)
    if pir != NULL_INDEX:
        self.index_header = IndexHeader(self.records[pir])
        numi = self.index_header.index_count
        first_cncx = pir + 1 + numi
        self.cncx = CNCX(
            self.records[first_cncx:first_cncx + self.index_header.num_of_cncx_blocks],
            self.index_header.index_encoding)
        self.index_record = IndexRecord(self.records[pir + 1:first_cncx],
                                        self.index_header, self.cncx)
        self.indexing_record_nums = set(
            range(pir, first_cncx + self.index_header.num_of_cncx_blocks))

    # Secondary index, which reuses the primary CNCX.
    self.secondary_index_record = self.secondary_index_header = None
    sir = self.mobi_header.secondary_index_record
    if sir != NULL_INDEX:
        self.secondary_index_header = SecondaryIndexHeader(self.records[sir])
        numi = self.secondary_index_header.index_count
        self.indexing_record_nums.add(sir)
        self.secondary_index_record = IndexRecord(
            self.records[sir + 1:sir + 1 + numi],
            self.secondary_index_header, self.cncx)
        self.indexing_record_nums |= set(range(sir + 1, sir + 1 + numi))

    ntr = self.mobi_header.number_of_text_records
    fii = self.mobi_header.first_image_index
    self.text_records = [
        TextRecord(num, self.records[num],
                   self.mobi_header.extra_data_flags, mf.decompress6)
        for num in range(1, min(len(self.records), ntr + 1))]

    self.image_records, self.binary_records = [], []
    self.font_records = []
    non_image_signatures = {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n',
                            b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI',
                            b'VIDE', b'FONT', b'CRES', b'CONT', b'CMET'}
    image_index = 0
    last = min(self.mobi_header.last_resource_record, len(self.records))
    for i in range(self.mobi_header.first_resource_record, last):
        if i in self.indexing_record_nums or i in self.huffman_record_nums:
            continue
        image_index += 1
        rec = self.records[i]
        fmt = None
        if i >= fii and rec.raw[:4] not in non_image_signatures:
            try:
                fmt = what(None, rec.raw)
            except:
                pass

        if fmt is not None:
            self.image_records.append(ImageRecord(image_index, rec, fmt))
        elif rec.raw[:4] == b'FONT':
            self.font_records.append(FontRecord(i, rec))
        else:
            self.binary_records.append(BinaryRecord(i, rec))

    if self.index_record is not None:
        self.tbs_indexing = TBSIndexing(self.text_records,
                                        self.index_record.indices,
                                        self.mobi_header.type_raw)
def get_metadata(stream):
    """Read metadata (and, when possible, a cover) from a MOBI ``stream``.

    Streams starting with ``TPZ`` are delegated to the Topaz reader. When
    the header has no EXTH block and the stream is smaller than 4 MiB, the
    content is extracted to a temporary directory to look for embedded
    metadata. Returns a metadata object.
    """
    from calibre.ebooks.metadata import MetaInformation
    from calibre.ptempfile import TemporaryDirectory
    from calibre.ebooks.mobi.reader.headers import MetadataHeader
    from calibre.ebooks.mobi.reader.mobi6 import MobiReader
    from calibre.utils.magick.draw import save_cover_data_to
    from calibre import CurrentDir

    stream.seek(0)
    try:
        magic = stream.read(3)
    except:
        magic = ''
    stream.seek(0)
    if magic == b'TPZ':
        from calibre.ebooks.metadata.topaz import get_metadata
        return get_metadata(stream)

    from calibre.utils.logging import Log
    log = Log()
    try:
        mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')])
    except:
        mi = MetaInformation(_('Unknown'), [_('Unknown')])

    header = MetadataHeader(stream, log)
    if header.title and header.title != _('Unknown'):
        mi.title = header.title

    if header.exth is None:
        # No EXTH block: fall back to parsing the book itself, but only
        # for small files.
        size = 1024**3
        if hasattr(stream, 'seek') and hasattr(stream, 'tell'):
            pos = stream.tell()
            stream.seek(0, 2)
            size = stream.tell()
            stream.seek(pos)
        if size < 4*1024*1024:
            with TemporaryDirectory('_mobi_meta_reader') as tdir:
                with CurrentDir(tdir):
                    reader = MobiReader(stream, log)
                    parse_cache = {}
                    reader.extract_content(tdir, parse_cache)
                    if reader.embedded_mi is not None:
                        mi = reader.embedded_mi
    elif header.exth.mi is not None:
        mi = header.exth.mi

    if hasattr(header.exth, 'cover_offset'):
        cover_index = header.first_image_index + header.exth.cover_offset
        data = header.section_data(int(cover_index))
    else:
        try:
            data = header.section_data(header.first_image_index)
        except:
            data = ''

    if data and what(None, data) in {'jpg', 'jpeg', 'gif', 'png', 'bmp', 'webp'}:
        try:
            mi.cover_data = ('jpg', save_cover_data_to(data, 'cover.jpg', return_data=True))
        except Exception:
            log.exception('Failed to read MOBI cover')
    return mi
def find_imgtype(data):
    """Return the image type of ``data``.

    ``what`` is tried first. When it returns ``None`` the third element of
    ``identify_data(data)`` is used, and ``'unknown'`` when that call fails.
    """
    detected = what(None, data)
    if detected is not None:
        return detected
    try:
        return identify_data(data)[2]
    except Exception:
        return 'unknown'
def resource_adder(self, link_, base=None):
    """Add the local file referenced by ``link_`` to the OEB manifest.

    Returns the manifest href (plus ``#fragment`` when present) for the
    linked file. ``link_`` is returned unchanged when it cannot be resolved
    to a readable regular file, and ``None`` is returned for links to
    plain-text files.
    """
    from urllib import quote

    link, frag = self.link_to_local_path(link_, base=base)
    if link is None:
        return link_
    try:
        if base and not os.path.isabs(link):
            link = os.path.join(base, link)
        link = os.path.abspath(link)
    except:
        return link_
    if not os.access(link, os.R_OK):
        return link_
    if os.path.isdir(link):
        self.log.warn(link_, 'is a link to a directory. Ignoring.')
        return link_
    if not self.is_case_sensitive(tempfile.gettempdir()):
        link = link.lower()

    if link not in self.added_resources:
        parent_dir = os.path.dirname(link)
        bhref = os.path.basename(link)
        item_id, href = self.oeb.manifest.generate(id='added', href=bhref)
        guessed = self.guess_type(href)[0]
        media_type = guessed or self.BINARY_MIME
        if media_type == 'text/plain':
            self.log.warn('Ignoring link to text file %r'%link_)
            return None
        if media_type == self.BINARY_MIME:
            # Check for the common case, images
            try:
                img = what(link)
            except EnvironmentError:
                img = None
            if img:
                media_type = self.guess_type('dummy.'+img)[0] or self.BINARY_MIME

        self.oeb.log.debug('Added', link)
        self.oeb.container = self.DirContainer(parent_dir, self.oeb.log,
                                               ignore_opf=True)
        # Load into memory
        item = self.oeb.manifest.add(item_id, href, media_type)
        # bhref refers to an already existing file. The read() method of
        # DirContainer will call unquote on it before trying to read the
        # file, therefore we quote it here.
        if isinstance(bhref, unicode):
            bhref = bhref.encode('utf-8')
        item.html_input_href = quote(bhref).decode('utf-8')
        if guessed in self.OEB_STYLES:
            item.override_css_fetch = partial(self.css_import_handler, parent_dir)
        item.data
        self.added_resources[link] = href

    nlink = self.added_resources[link]
    return '#'.join((nlink, frag)) if frag else nlink
def mobify_image(data):
    """Return ``data`` re-encoded as GIF when it is a PNG, else unchanged.

    PNG input is converted because some PNG files cannot be displayed on
    the Kindle.
    """
    if what(None, data) != 'png':
        return data
    image = Image()
    image.load(data)
    return image.export('gif')
def mobify_image(data):
    """Return ``data`` re-encoded as GIF when it is a PNG, else unchanged.

    PNG input is converted because some PNG files cannot be displayed on
    the Kindle.
    """
    if what(None, data) != "png":
        return data
    image = Image()
    image.load(data)
    return image.export("gif")
def cf():
    """Let the user pick a file and copy its path into the dialog ``d``.

    ``d`` comes from the enclosing scope. When the chosen path exists, the
    "treat as image" checkbox is set according to whether ``what`` reports
    JPEG, PNG or GIF.
    """
    files = choose_files(d, 'select link file', _('Choose file'),
                         select_only_single_file=True)
    if not files:
        return
    path = files[0]
    d.url.setText(path)
    if not (path and os.path.exists(path)):
        return
    with lopen(path, 'rb') as f:
        detected = what(f)
    d.treat_as_image.setChecked(detected in {'jpeg', 'png', 'gif'})
def image_from_data(data):
    """Create an image object from ``data``, which should be a bytestring.

    A ``QImage`` argument is returned as is. Data Qt cannot load is handed
    to ``load_jxr_data`` when ``what`` identifies it as JPEG XR.

    Raises:
        NotImage: if the data cannot be loaded and is not JPEG XR.
    """
    if isinstance(data, QImage):
        return data
    image = QImage()
    if image.loadFromData(data):
        return image
    if what(None, data) == 'jxr':
        return load_jxr_data(data)
    raise NotImage('Not a valid image')
def image_data_to_url(data, base='cover'):
    """Wrap ``data`` in a ``BytesIO`` whose ``name`` is ``cover.<ext>``.

    The extension comes from ``what``; when nothing is detected it is
    ``pdf`` for data starting with ``%PDF-`` and ``jpg`` otherwise.

    NOTE(review): ``base`` is accepted but never used; the name always
    starts with ``cover``. Confirm whether that is intentional.
    """
    from calibre.utils.imghdr import what

    buf = BytesIO(data)
    ext = what(None, data)
    if not ext:
        ext = 'pdf' if data.startswith(b'%PDF-') else 'jpg'
    buf.name = 'cover.' + ext
    return buf
def mobify_image(data):
    """Return ``data`` re-encoded as GIF when it is a PNG, else unchanged.

    PNG input is converted because some PNG files cannot be displayed on
    the Kindle.
    """
    if what(None, data) != 'png':
        return data
    from PIL import Image
    image = Image.open(BytesIO(data))
    out = BytesIO()
    image.save(out, 'gif')
    return out.getvalue()
def read_cover(mi):
    """Populate ``mi.cover_data`` from the file at ``mi.cover`` if needed.

    Nothing is done when ``mi.cover_data`` already holds data or when
    ``mi.cover`` is empty. Read failures (``EnvironmentError``) are ignored.
    ``mi`` is always returned.
    """
    already_loaded = mi.cover_data and mi.cover_data[1]
    if already_loaded or not mi.cover:
        return mi
    try:
        with lopen(mi.cover, 'rb') as f:
            raw = f.read()
        mi.cover_data = (what(None, raw), raw)
    except EnvironmentError:
        pass
    return mi
def image_from_data(data):
    """Create an image object from ``data``, which should be a bytestring.

    A ``QImage`` argument is returned as is. Data Qt cannot load is handed
    to ``load_jxr_data`` when ``what`` identifies it as JPEG XR.

    Raises:
        NotImage: if the data cannot be loaded and is not JPEG XR; the
            message includes the type ``what`` detected.
    """
    if isinstance(data, QImage):
        return data
    image = QImage()
    if image.loadFromData(data):
        return image
    detected = what(None, data)
    if detected == 'jxr':
        return load_jxr_data(data)
    raise NotImage('Not a valid image (detected type: {})'.format(detected))
def mobify_image(data):
    """Return ``data`` re-encoded as GIF when it is a PNG, else unchanged.

    PNG input is converted because some PNG files cannot be displayed on
    the Kindle. Input that is not already a ``StringIO`` is wrapped in one
    before being opened.
    """
    if what(None, data) != 'png':
        return data
    source = data if isinstance(data, StringIO) else StringIO(data)
    image = Image.open(source)
    out = StringIO()
    image.save(out, 'GIF')
    return out.getvalue()
def cdb_set_fields(ctx, rd, book_id, library_id):
    """Apply the field changes in the request body to ``book_id``.

    The body must be msgpack or JSON (selected by the ``Content-Type``
    header) and contain a ``changes`` dict; it may also contain
    ``loaded_book_ids`` and ``all_dirtied``. A ``cover`` entry in
    ``changes`` is either ``None`` or ASCII base64 text (optionally
    preceded by a ``...,`` prefix) that must decode to JPEG or PNG.

    Returns:
        A dict mapping every reported book id to that book's own
        ``book_as_json`` output.

    Raises:
        HTTPForbidden: if the user has per-library restrictions.
        HTTPBadRequest: if the content type is unsupported, the body does
            not decode, or the payload or cover data is malformed.
    """
    db = get_db(ctx, rd, library_id)
    if ctx.restriction_for(rd, db):
        raise HTTPForbidden(
            'Cannot use the set fields interface with a user who has per library restrictions'
        )

    raw = rd.read()
    ct = rd.inheaders.get('Content-Type', all=True)
    ct = {x.lower().partition(';')[0] for x in ct}
    # Choose the decoder outside the try block, so the "unsupported content
    # type" error is not replaced by the generic decoding error below.
    if MSGPACK_MIME in ct:
        loads = msgpack_loads
    elif 'application/json' in ct:
        loads = json_loads
    else:
        raise HTTPBadRequest('Only JSON or msgpack requests are supported')
    try:
        data = loads(raw)
    except Exception:
        raise HTTPBadRequest('Invalid encoded data')

    try:
        changes = data['changes']
        loaded_book_ids = frozenset(map(int, data.get('loaded_book_ids', ())))
        all_dirtied = bool(data.get('all_dirtied'))
        if not isinstance(changes, dict):
            raise TypeError('changes must be a dict')
    except Exception:
        raise HTTPBadRequest(
            '''Data must be of the form {'changes': {'title': 'New Title', ...}, 'loaded_book_ids':[book_id1, book_id2, ...]'}'''
        )

    dirtied = set()
    # False is the "no cover key present" sentinel; None is a valid value.
    cdata = changes.pop('cover', False)
    if cdata is not False:
        if cdata is not None:
            try:
                cdata = standard_b64decode(
                    cdata.split(',', 1)[-1].encode('ascii'))
            except Exception:
                raise HTTPBadRequest(
                    'Cover data is not valid base64 encoded data')
            try:
                fmt = what(None, cdata)
            except Exception:
                fmt = None
            if fmt not in ('jpeg', 'png'):
                raise HTTPBadRequest('Cover data must be either JPEG or PNG')
        dirtied |= db.set_cover({book_id: cdata})

    for field, value in changes.iteritems():
        dirtied |= db.set_field(field, {book_id: value})
    ctx.notify_changes(db.backend.library_path, metadata(dirtied))

    all_ids = dirtied if all_dirtied else (dirtied & loaded_book_ids)
    all_ids |= {book_id}
    # Serialize each book under its own id (previously every entry was
    # built from book_id, so other dirtied books got the wrong data).
    return {bid: book_as_json(db, bid) for bid in all_ids}
def extract_images(self, processed_records, output_dir):
    """Save image sections under ``output_dir/images``.

    Files are named ``NNNNN.<fmt>``. GIFs are converted to PNG unless
    ``gif_data_to_png_data`` raises ``AnimatedGIF``; PNG data is written
    verbatim and other formats go through ``save_cover_data_to``. Visited
    section indices are appended to ``processed_records`` and saved file
    names to ``self.image_names``.

    Returns:
        dict mapping image index to file name.
    """
    non_image_signatures = {
        b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'RESC', b'BOUN',
        b'FDST', b'DATP', b'AUDI', b'VIDE'
    }
    image_formats = {'jpg', 'jpeg', 'gif', 'png', 'bmp'}

    self.log.debug('Extracting images...')
    output_dir = os.path.abspath(os.path.join(output_dir, 'images'))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    self.image_names = []
    image_name_map = {}

    start = getattr(self.book_header, 'first_image_index', -1)
    if start < 0 or start > self.num_sections:
        # BAEN PRC files have bad headers
        start = 0

    image_index = 0
    for section_num in range(start, self.num_sections):
        if section_num in processed_records:
            continue
        processed_records.append(section_num)
        data = self.sections[section_num][0]
        image_index += 1
        if data[:4] in non_image_signatures:
            # This record is a known non image type, no need to try to
            # load the image
            continue

        try:
            imgfmt = what(None, data)
        except Exception:
            continue
        if imgfmt not in image_formats:
            continue
        if imgfmt == 'jpeg':
            imgfmt = 'jpg'
        if imgfmt == 'gif':
            try:
                data = gif_data_to_png_data(data)
                imgfmt = 'png'
            except AnimatedGIF:
                pass

        path = os.path.join(output_dir, '%05d.%s' % (image_index, imgfmt))
        # The name is recorded in the map even if saving fails below.
        image_name_map[image_index] = os.path.basename(path)
        if imgfmt == 'png':
            with open(path, 'wb') as f:
                f.write(data)
        else:
            try:
                save_cover_data_to(data, path, minify_to=(10000, 10000))
            except Exception:
                continue
        self.image_names.append(os.path.basename(path))
    return image_name_map
def extract_resources(self):
    """Write fonts and images from ``self.resource_sections`` to disk.

    Creates ``fonts`` and ``images`` directories in the current directory.
    Returns a list with one entry per section: the relative href of the
    written file, or ``None`` for ignored record types.
    """
    ignored_types = {
        b"FLIS", b"FCIS", b"SRCS", b"\xe9\x8e\r\n", b"RESC",
        b"BOUN", b"FDST", b"DATP", b"AUDI", b"VIDE",
    }
    for dirname in ("fonts", "images"):
        os.mkdir(dirname)

    resource_map = []
    for fname_idx, section in enumerate(self.resource_sections, 1):
        data = section[0]
        typ = data[:4]
        href = None
        if typ == b"FONT":
            font = read_font_record(data)
            href = "fonts/%05d.%s" % (fname_idx, font["ext"])
            if font["err"]:
                self.log.warn("Reading font record %d failed: %s" % (fname_idx, font["err"]))
                if font["headers"]:
                    self.log.debug("Font record headers: %s" % font["headers"])
            with open(href.replace("/", os.sep), "wb") as f:
                f.write(font["font_data"] or font["raw_data"])
            if font["encrypted"]:
                self.encrypted_fonts.append(href)
        elif typ not in ignored_types:
            imgtype = what(None, data)
            if imgtype is None:
                from calibre.utils.magick.draw import identify_data
                try:
                    imgtype = identify_data(data)[2]
                except Exception:
                    imgtype = "unknown"
            href = "images/%05d.%s" % (fname_idx, imgtype)
            with open(href.replace("/", os.sep), "wb") as f:
                f.write(data)
        resource_map.append(href)
    return resource_map
def download_resources(browser, resource_cache, output_dir):
    """Download the page's images and stylesheets into ``output_dir``.

    Element ``src``/``href`` attributes are rewritten to the local names.
    ``resource_cache`` maps the SHA-1 digest of a resource to its saved
    path, so identical content is written only once. Non-stylesheet
    ``<link>`` elements and elements whose resource failed to load are
    removed from the document.
    """
    resources = defaultdict(list)

    for img in browser.css_select('img[src]', all=True):
        # Using javascript ensures that absolute URLs are returned, direct
        # attribute access does not do that
        src = unicode(img.evaluateJavaScript('this.src').toString()).strip()
        if src:
            resources[src].append(img)

    for link in browser.css_select('link[href]', all=True):
        link_type = unicode(link.attribute('type')).strip() or 'text/css'
        rel = unicode(link.attribute('rel')).strip() or 'stylesheet'
        if link_type != 'text/css' or rel != 'stylesheet':
            link.removeFromDocument()
            continue
        href = unicode(
            link.evaluateJavaScript('this.href').toString()).strip()
        if not href:
            link.removeFromDocument()
            continue
        resources[href].append(link)

    loaded_resources = browser.wait_for_resources(resources)
    img_counter = style_counter = 0
    for url, raw in loaded_resources.iteritems():
        digest = hashlib.sha1(raw).digest()
        if digest in resource_cache:
            href = os.path.relpath(resource_cache[digest],
                                   output_dir).replace(os.sep, '/')
        else:
            first = resources[url][0]
            if is_tag(first, 'link'):
                style_counter += 1
                href = 'style_%d.css' % style_counter
            else:
                img_counter += 1
                ext = what(None, raw) or 'jpg'
                if ext == 'jpeg':
                    ext = 'jpg'  # Apparently Moon+ cannot handle .jpeg
                href = 'img_%d.%s' % (img_counter, ext)
            dest = os.path.join(output_dir, href)
            resource_cache[digest] = dest
            with open(dest, 'wb') as f:
                f.write(raw)
        for elem in resources[url]:
            attr = 'href' if is_tag(elem, 'link') else 'src'
            elem.setAttribute(attr, href)

    failed = set(resources) - set(loaded_resources)
    for url in failed:
        browser.log.warn('Failed to download resource:', url)
        for elem in resources[url]:
            elem.removeFromDocument()
def extract_resources(self):
    """Write fonts and images from ``self.resource_sections`` to disk.

    Creates ``fonts`` and ``images`` directories in the current directory.
    RESC records are parsed into ``self.resc_data``. Returns a list with
    one entry per section: the relative href of the written file, or
    ``None`` for ignored records, RESC records and placeholder GIFs.
    """
    from calibre.ebooks.mobi.writer2.resources import PLACEHOLDER_GIF

    ignored_types = {
        b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'BOUN', b'FDST',
        b'DATP', b'AUDI', b'VIDE'
    }
    for dirname in ('fonts', 'images'):
        os.mkdir(dirname)

    resource_map = []
    for fname_idx, section in enumerate(self.resource_sections, 1):
        data = section[0]
        typ = data[:4]
        href = None
        if typ in ignored_types:
            pass  # Ignore these records
        elif typ == b'RESC':
            self.resc_data = read_resc_record(data)
        elif typ == b'FONT':
            font = read_font_record(data)
            href = "fonts/%05d.%s" % (fname_idx, font['ext'])
            if font['err']:
                self.log.warn('Reading font record %d failed: %s' %
                              (fname_idx, font['err']))
                if font['headers']:
                    self.log.debug('Font record headers: %s' % font['headers'])
            with open(href.replace('/', os.sep), 'wb') as f:
                f.write(font['font_data'] or font['raw_data'])
            if font['encrypted']:
                self.encrypted_fonts.append(href)
        else:
            is_placeholder = (len(data) == len(PLACEHOLDER_GIF) and
                              data == PLACEHOLDER_GIF)
            if not is_placeholder:
                imgtype = what(None, data)
                if imgtype is None:
                    from calibre.utils.magick.draw import identify_data
                    try:
                        imgtype = identify_data(data)[2]
                    except Exception:
                        imgtype = 'unknown'
                href = 'images/%05d.%s' % (fname_idx, imgtype)
                with open(href.replace('/', os.sep), 'wb') as f:
                    f.write(data)
        resource_map.append(href)
    return resource_map
def download_resources(browser, resource_cache, output_dir):
    """Download the page's images and stylesheets into ``output_dir``.

    Element ``src``/``href`` attributes are rewritten to the local names.
    ``resource_cache`` maps the SHA-1 digest of a resource to its saved
    path, so identical content is written only once. Non-stylesheet
    ``<link>`` elements and elements whose resource failed to load are
    removed from the document.
    """
    resources = defaultdict(list)

    for img in browser.css_select("img[src]", all=True):
        # Using javascript ensures that absolute URLs are returned, direct
        # attribute access does not do that
        src = unicode(img.evaluateJavaScript("this.src").toString()).strip()
        if src:
            resources[src].append(img)

    for link in browser.css_select("link[href]", all=True):
        link_type = unicode(link.attribute("type")).strip() or "text/css"
        rel = unicode(link.attribute("rel")).strip() or "stylesheet"
        if link_type != "text/css" or rel != "stylesheet":
            link.removeFromDocument()
            continue
        href = unicode(link.evaluateJavaScript("this.href").toString()).strip()
        if not href:
            link.removeFromDocument()
            continue
        resources[href].append(link)

    loaded_resources = browser.wait_for_resources(resources)
    img_counter = style_counter = 0
    for url, raw in loaded_resources.iteritems():
        digest = hashlib.sha1(raw).digest()
        if digest in resource_cache:
            href = os.path.relpath(resource_cache[digest],
                                   output_dir).replace(os.sep, "/")
        else:
            first = resources[url][0]
            if is_tag(first, "link"):
                style_counter += 1
                href = "style_%d.css" % style_counter
            else:
                img_counter += 1
                ext = what(None, raw) or "jpg"
                if ext == "jpeg":
                    ext = "jpg"  # Apparently Moon+ cannot handle .jpeg
                href = "img_%d.%s" % (img_counter, ext)
            dest = os.path.join(output_dir, href)
            resource_cache[digest] = dest
            with open(dest, "wb") as f:
                f.write(raw)
        for elem in resources[url]:
            attr = "href" if is_tag(elem, "link") else "src"
            elem.setAttribute(attr, href)

    failed = set(resources) - set(loaded_resources)
    for url in failed:
        browser.log.warn("Failed to download resource:", url)
        for elem in resources[url]:
            elem.removeFromDocument()
def generate_filename(self, rid, base=None, rid_map=None):
    """Write the image for relationship ``rid`` to ``self.dest_dir``.

    ``file://`` targets are read from the local filesystem, everything else
    from ``self.docx``. Returns the file name used; a name already recorded
    in ``self.used`` for the same source is returned without re-reading.
    Name clashes are resolved by appending ``-<n>`` before the extension.

    Raises:
        LinkedImageNotFound: if a ``file://`` target does not exist.
    """
    if rid_map is None:
        rid_map = self.rid_map
    fname = rid_map[rid]
    if fname in self.used:
        return self.used[fname]

    scheme = 'file://'
    if fname.startswith(scheme):
        src = fname[len(scheme):]
        if iswindows and src and src[0] == '/':
            src = src[1:]
        if not src or not os.path.exists(src):
            raise LinkedImageNotFound(src)
        with open(src, 'rb') as rawsrc:
            raw = rawsrc.read()
    else:
        raw = self.docx.read(fname)

    base = base or ascii_filename(
        rid_map[rid].rpartition('/')[-1]).replace(' ', '_') or 'image'
    ext = what(None, raw) or base.rpartition('.')[-1] or 'jpeg'
    if ext == 'emf':
        # For an example, see: https://bugs.launchpad.net/bugs/1224849
        self.log(
            'Found an EMF image: %s, trying to extract embedded raster image'
            % base)
        from calibre.utils.wmf.emf import emf_unwrap
        try:
            raw = emf_unwrap(raw)
        except Exception:
            self.log.exception(
                'Failed to extract embedded raster image from EMF')
        else:
            ext = 'png'
    base = (base.rpartition('.')[0] or 'image') + '.' + ext

    # Pick the first candidate name not already handed out.
    taken = frozenset(self.used.itervalues())
    stem, _sep, tail = base.rpartition('.')
    name, counter = base, 1
    while name in taken:
        name = '%s-%d.%s' % (stem, counter, tail)
        counter += 1

    self.used[fname] = name
    with open(os.path.join(self.dest_dir, name), 'wb') as f:
        f.write(raw)
    self.all_images.add('images/' + name)
    return name
def download_resources(browser, resource_cache, output_dir):
    """Download the page's images and stylesheets into ``output_dir``.

    Element ``src``/``href`` attributes are rewritten to the local names.
    ``resource_cache`` maps the SHA-1 digest of a resource to its saved
    path, so identical content is written only once. Non-stylesheet
    ``<link>`` elements and elements whose resource failed to load are
    removed from the document.
    """
    resources = defaultdict(list)

    for img in browser.css_select('img[src]', all=True):
        # Using javascript ensures that absolute URLs are returned, direct
        # attribute access does not do that
        src = unicode(img.evaluateJavaScript('this.src') or '').strip()
        if src:
            resources[src].append(img)

    for link in browser.css_select('link[href]', all=True):
        link_type = unicode(link.attribute('type')).strip() or 'text/css'
        rel = unicode(link.attribute('rel')).strip() or 'stylesheet'
        if link_type != 'text/css' or rel != 'stylesheet':
            link.removeFromDocument()
            continue
        href = unicode(link.evaluateJavaScript('this.href') or '').strip()
        if not href:
            link.removeFromDocument()
            continue
        resources[href].append(link)

    loaded_resources = browser.wait_for_resources(resources)
    img_counter = style_counter = 0
    for url, raw in loaded_resources.iteritems():
        digest = hashlib.sha1(raw).digest()
        if digest in resource_cache:
            href = os.path.relpath(resource_cache[digest],
                                   output_dir).replace(os.sep, '/')
        else:
            first = resources[url][0]
            if is_tag(first, 'link'):
                style_counter += 1
                href = 'style_%d.css' % style_counter
            else:
                img_counter += 1
                ext = what(None, raw) or 'jpg'
                if ext == 'jpeg':
                    ext = 'jpg'  # Apparently Moon+ cannot handle .jpeg
                href = 'img_%d.%s' % (img_counter, ext)
            dest = os.path.join(output_dir, href)
            resource_cache[digest] = dest
            with open(dest, 'wb') as f:
                f.write(raw)
        for elem in resources[url]:
            attr = 'href' if is_tag(elem, 'link') else 'src'
            elem.setAttribute(attr, href)

    failed = set(resources) - set(loaded_resources)
    for url in failed:
        browser.log.warn('Failed to download resource:', url)
        for elem in resources[url]:
            elem.removeFromDocument()
def process_image(self, data):
    """Return ``data`` after passing it through ``mobify_image()``.

    When ``self.process_images`` is false the data is returned untouched.
    If ``mobify_image()`` fails and ``what()`` identifies the data as PNG,
    the data is written to a temporary file, run through
    ``optimize_png()``, read back from the same path and converted again.
    Failures for any other format are re-raised.

    :param data: raw image bytes.
    :return: the processed image bytes.
    """
    if not self.process_images:
        return data
    try:
        return mobify_image(data)
    except Exception:
        if 'png' != what(None, data):
            raise
        with PersistentTemporaryFile(suffix='.png') as pt:
            pt.write(data)
        try:
            from calibre.utils.img import optimize_png
            optimize_png(pt.name)
            # Close the handle explicitly before the file is removed below
            with lopen(pt.name, 'rb') as f:
                data = f.read()
        finally:
            os.remove(pt.name)
        return mobify_image(data)
def extract_resources(self):
    """Write font and image records to ``fonts/`` and ``images/``.

    Both directories are created relative to the current working
    directory. Each entry of ``self.resource_sections`` is classified by
    the first four bytes of its data:

    * a fixed set of record types is ignored,
    * ``RESC`` is parsed into ``self.resc_data``,
    * ``FONT`` is decoded and written under ``fonts/``,
    * anything else is written under ``images/``.

    :return: a list with the href written for each section, or ``None``
        for sections that produced no file. Sections equal to
        ``PLACEHOLDER_GIF`` are skipped without adding an entry.
    """
    from calibre.ebooks.mobi.writer2.resources import PLACEHOLDER_GIF

    ignored_types = {
        b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'BOUN', b'FDST',
        b'DATP', b'AUDI', b'VIDE'}

    for dirname in ('fonts', 'images'):
        os.mkdir(dirname)

    resource_map = []
    for fname_idx, sec in enumerate(self.resource_sections, 1):
        data = sec[0]
        typ = data[:4]
        href = None

        if typ in ignored_types:
            pass  # Ignore these records
        elif typ == b'RESC':
            self.resc_data = read_resc_record(data)
        elif typ == b'FONT':
            font = read_font_record(data)
            href = "fonts/%05d.%s" % (fname_idx, font['ext'])
            if font['err']:
                self.log.warn('Reading font record %d failed: %s'%(
                    fname_idx, font['err']))
                if font['headers']:
                    self.log.debug('Font record headers: %s'%font['headers'])
            # Fall back to the undecoded record when no font data is available
            font_bytes = font['font_data'] or font['raw_data']
            with open(href.replace('/', os.sep), 'wb') as out:
                out.write(font_bytes)
            if font['encrypted']:
                self.encrypted_fonts.append(href)
        else:
            if len(data) == len(PLACEHOLDER_GIF) and data == PLACEHOLDER_GIF:
                continue
            imgtype = what(None, data)
            if imgtype is None:
                from calibre.utils.magick.draw import identify_data
                try:
                    imgtype = identify_data(data)[2]
                except Exception:
                    imgtype = 'unknown'
            href = 'images/%05d.%s'%(fname_idx, imgtype)
            with open(href.replace('/', os.sep), 'wb') as out:
                out.write(data)

        resource_map.append(href)

    return resource_map
def ask_link(self):
    """Show a modal "Create link" dialog and report what the user entered.

    :return: ``(link, name, is_image)``. ``link`` and ``name`` are the
        stripped contents of the two line edits, or ``None`` if the
        dialog was not accepted. ``is_image`` is True only when ``link``
        is an existing local path whose content ``what()`` reports as
        ``jpeg``, ``png`` or ``gif``.
    """
    dialog = QDialog(self)
    dialog.setWindowTitle(_('Create link'))
    form = QFormLayout()
    dialog.setLayout(form)
    dialog.url = QLineEdit(dialog)
    dialog.name = QLineEdit(dialog)
    dialog.setMinimumWidth(600)
    dialog.bb = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
    dialog.br = browse = QPushButton(_('&Browse'))
    browse.setIcon(QIcon(I('document_open.png')))

    def pick_file():
        # Copy the chosen path into the URL field
        chosen = choose_files(dialog, 'select link file', _('Choose file'),
                              select_only_single_file=True)
        if chosen:
            dialog.url.setText(chosen[0])
    browse.clicked.connect(pick_file)

    dialog.la = help_label = QLabel(
        _('Enter a URL. You can also choose to create a link to a file on '
          'your computer. If the selected file is an image, it will be '
          'inserted as an image. Note that if you create a link to a file on '
          'your computer, it will stop working if the file is moved.'))
    help_label.setWordWrap(True)
    help_label.setStyleSheet('QLabel { margin-bottom: 1.5ex }')

    form.setWidget(0, form.SpanningRole, help_label)
    form.addRow(_('Enter &URL:'), dialog.url)
    form.addRow(_('Enter &name (optional):'), dialog.name)
    form.addRow(_('Choose a file on your computer:'), dialog.br)
    form.addRow(dialog.bb)
    dialog.bb.accepted.connect(dialog.accept)
    dialog.bb.rejected.connect(dialog.reject)
    dialog.resize(dialog.sizeHint())

    if dialog.exec_() != dialog.Accepted:
        return None, None, False

    link = unicode(dialog.url.text()).strip()
    name = unicode(dialog.name.text()).strip()
    is_image = False
    if link and os.path.exists(link):
        with lopen(link, 'rb') as f:
            detected = what(f)
        is_image = detected in {'jpeg', 'png', 'gif'}
    return link, name, is_image
def generate_filename(self, rid, base=None):
    """Write the image referenced by ``rid`` into ``self.dest_dir``.

    The image is read with ``self.docx.read(self.rid_map[rid])``. The
    chosen name is remembered in ``self.used`` keyed by ``rid``, so a
    relationship id that was already handled is returned without being
    re-read.

    :param rid: key to look up in ``self.rid_map``.
    :param base: optional preferred file name; by default it is derived
        from the last path component of the target.
    :return: the file name used inside ``self.dest_dir``.
    """
    if rid in self.used:
        return self.used[rid]
    raw = self.docx.read(self.rid_map[rid])
    base = base or ascii_filename(
        self.rid_map[rid].rpartition('/')[-1]).replace(' ', '_')
    ext = what(None, raw) or base.rpartition('.')[-1] or 'jpeg'
    # A base without a '.' (or an empty one) has an empty stem; fall back
    # to a generic one instead of producing a dot-file such as ".png"
    stem = base.rpartition('.')[0] or 'image'
    name = stem + '.' + ext
    # values() rather than itervalues(): available on Python 2 and 3
    exists = frozenset(self.used.values())
    c = 1
    while name in exists:
        # Always number from the original stem so suffixes do not pile up
        # ("a-2.png", not "a-1-2.png")
        name = '%s-%d.%s' % (stem, c, ext)
        c += 1
    self.used[rid] = name
    with open(os.path.join(self.dest_dir, name), 'wb') as f:
        f.write(raw)
    self.all_images.add('images/' + name)
    return name
def process_image(self, data):
    """Return ``data`` converted for inclusion in the output.

    When ``self.process_images`` is false the data is returned untouched.
    Otherwise it is converted with ``mobify_image()`` if
    ``self.opts.mobi_keep_original_images`` is set, else with
    ``rescale_image()``, and the result is passed through
    ``ensure_jpeg_has_jfif()``.

    If that fails and ``what()`` identifies the data as PNG, the data is
    written to a temporary file, run through ``optimize_png()``, read
    back from the same path and converted again. Failures for any other
    format are re-raised.

    :param data: raw image bytes.
    :return: the processed image bytes.
    """
    if not self.process_images:
        return data
    func = mobify_image if self.opts.mobi_keep_original_images else rescale_image
    try:
        return ensure_jpeg_has_jfif(func(data))
    except Exception:
        if 'png' != what(None, data):
            raise
        with PersistentTemporaryFile(suffix='.png') as pt:
            pt.write(data)
        try:
            from calibre.utils.img import optimize_png
            optimize_png(pt.name)
            # Close the handle explicitly before the file is removed below
            with lopen(pt.name, 'rb') as f:
                data = f.read()
        finally:
            os.remove(pt.name)
        # NOTE(review): unlike the first attempt, this result is not passed
        # through ensure_jpeg_has_jfif() -- confirm that is intended
        return func(data)
def process_image(self, data):
    """Return ``data`` converted for inclusion in the output.

    When ``self.process_images`` is false the data is returned untouched.
    Otherwise it is converted with ``mobify_image()`` if
    ``self.opts.mobi_keep_original_images`` is set, else with
    ``rescale_image()``.

    If that fails and ``what()`` identifies the data as PNG, the data is
    written to a temporary file, run through ``optimize_png()``, read
    back from the same path and converted again. Failures for any other
    format are re-raised.

    :param data: raw image bytes.
    :return: the processed image bytes.
    """
    if not self.process_images:
        return data
    func = mobify_image if self.opts.mobi_keep_original_images else rescale_image
    try:
        return func(data)
    except Exception:
        if 'png' != what(None, data):
            raise
        with PersistentTemporaryFile(suffix='.png') as pt:
            pt.write(data)
        try:
            from calibre.utils.img import optimize_png
            optimize_png(pt.name)
            # Close the handle explicitly before the file is removed below
            with lopen(pt.name, 'rb') as f:
                data = f.read()
        finally:
            os.remove(pt.name)
        return func(data)
def generate_filename(self, rid, base=None):
    """Write the image referenced by ``rid`` into ``self.dest_dir``.

    The image is read with ``self.docx.read(self.rid_map[rid])``. The
    chosen name is remembered in ``self.used`` keyed by ``rid``, so a
    relationship id that was already handled is returned without being
    re-read.

    :param rid: key to look up in ``self.rid_map``.
    :param base: optional preferred file name; by default it is derived
        from the last path component of the target.
    :return: the file name used inside ``self.dest_dir``.
    """
    if rid in self.used:
        return self.used[rid]
    target = self.rid_map[rid]
    raw = self.docx.read(target)
    if not base:
        base = ascii_filename(target.rpartition('/')[-1]).replace(' ', '_')
    ext = what(None, raw) or base.rpartition('.')[-1] or 'jpeg'
    # An empty stem would yield a dot-file such as ".png"; use a generic
    # stem in that case
    stem = base.rpartition('.')[0] or 'image'
    name = stem + '.' + ext
    # values() rather than itervalues(): available on Python 2 and 3
    taken = frozenset(self.used.values())
    counter = 1
    while name in taken:
        # Number from the original stem so that repeated collisions give
        # "a-2.png" instead of "a-1-2.png"
        name = '%s-%d.%s' % (stem, counter, ext)
        counter += 1
    self.used[rid] = name
    with open(os.path.join(self.dest_dir, name), 'wb') as f:
        f.write(raw)
    self.all_images.add('images/' + name)
    return name
def dataize_svg(self, item, svg=None):
    """Redirect xlink references in an SVG tree to temporary files.

    For every SVG element with an ``xlink:href`` whose path resolves
    (relative to ``item``) to an entry in the manifest, ``str()`` of that
    entry is written to a persistent temporary file and the attribute is
    replaced by the file's path. The paths are also appended to
    ``self.temp_files``.

    :param item: manifest item the SVG belongs to.
    :param svg: tree to process; defaults to ``item.data``.
    :return: the (modified in place) tree.
    """
    tree = item.data if svg is None else svg
    manifest_hrefs = self.oeb.manifest.hrefs
    for node in xpath(tree, '//svg:*[@xl:href]'):
        # Only the path part matters, any fragment is dropped
        target = urldefrag(urlnormalize(node.attrib[XLINK('href')]))[0]
        if target:
            resolved = item.abshref(target)
            if resolved in manifest_hrefs:
                payload = str(manifest_hrefs[resolved])
                suffix = '.' + (what(None, payload) or 'jpg')
                with PersistentTemporaryFile(suffix=suffix) as tmp:
                    tmp.write(payload)
                    self.temp_files.append(tmp.name)
                node.attrib[XLINK('href')] = tmp.name
    return tree
def dataize_svg(self, item, svg=None):
    """Replace manifest-backed ``xlink:href`` values with temp file paths.

    Each SVG element carrying an ``xlink:href`` is examined. If the path
    part of the href, made absolute relative to ``item``, names a manifest
    entry, ``str()`` of that entry is dumped into a persistent temporary
    file whose path then replaces the attribute value. Every such path is
    recorded in ``self.temp_files``.

    :param item: manifest item the SVG belongs to.
    :param svg: tree to process; ``item.data`` is used when omitted.
    :return: the tree that was processed.
    """
    if svg is None:
        svg = item.data
    known = self.oeb.manifest.hrefs
    for element in xpath(svg, '//svg:*[@xl:href]'):
        normalized = urlnormalize(element.attrib[XLINK('href')])
        path_part = urldefrag(normalized)[0]
        if not path_part:
            # Pure fragment reference, nothing to externalize
            continue
        absolute = item.abshref(path_part)
        if absolute not in known:
            continue
        blob = str(known[absolute])
        extension = what(None, blob) or 'jpg'
        with PersistentTemporaryFile(suffix='.'+extension) as handle:
            handle.write(blob)
            self.temp_files.append(handle.name)
        element.attrib[XLINK('href')] = handle.name
    return svg
def ask_link(self):
    """Ask the user for a link target and optional name.

    A modal dialog with a URL field, a name field and a file browse
    button is shown.

    :return: ``(link, name, is_image)``. If the dialog is not accepted
        this is ``(None, None, False)``. Otherwise ``link`` and ``name``
        are the stripped field contents, and ``is_image`` is True when
        ``link`` is an existing local path that ``what()`` identifies as
        ``jpeg``, ``png`` or ``gif``.
    """
    dlg = QDialog(self)
    dlg.setWindowTitle(_('Create link'))
    layout = QFormLayout()
    dlg.setLayout(layout)
    dlg.url = QLineEdit(dlg)
    dlg.name = QLineEdit(dlg)
    dlg.setMinimumWidth(600)
    dlg.bb = QDialogButtonBox(QDialogButtonBox.Ok|QDialogButtonBox.Cancel)
    dlg.br = browse_button = QPushButton(_('&Browse'))
    browse_button.setIcon(QIcon(I('document_open.png')))

    def browse_for_file():
        # Put the selected file's path into the URL field
        selection = choose_files(
            dlg, 'select link file', _('Choose file'),
            select_only_single_file=True)
        if selection:
            dlg.url.setText(selection[0])
    browse_button.clicked.connect(browse_for_file)

    dlg.la = description = QLabel(_(
        'Enter a URL. You can also choose to create a link to a file on '
        'your computer. If the selected file is an image, it will be '
        'inserted as an image. Note that if you create a link to a file on '
        'your computer, it will stop working if the file is moved.'))
    description.setWordWrap(True)
    description.setStyleSheet('QLabel { margin-bottom: 1.5ex }')
    layout.setWidget(0, layout.SpanningRole, description)

    layout.addRow(_('Enter &URL:'), dlg.url)
    layout.addRow(_('Enter &name (optional):'), dlg.name)
    layout.addRow(_('Choose a file on your computer:'), dlg.br)
    layout.addRow(dlg.bb)

    dlg.bb.accepted.connect(dlg.accept)
    dlg.bb.rejected.connect(dlg.reject)
    dlg.resize(dlg.sizeHint())

    result = (None, None, False)
    if dlg.exec_() == dlg.Accepted:
        link = unicode(dlg.url.text()).strip()
        name = unicode(dlg.name.text()).strip()
        is_image = False
        if link and os.path.exists(link):
            with lopen(link, 'rb') as f:
                kind = what(f)
            is_image = kind in {'jpeg', 'png', 'gif'}
        result = (link, name, is_image)
    return result
def extract_resources(self):
    """Build ``self.resource_map`` from ``self.resource_records``.

    Each kept record contributes a ``(name, payload)`` tuple where
    ``name`` is ``<prefix>/<index><suffix>.<ext>``; ``index`` is the
    record's position in ``self.resource_records``, zero padded to six
    digits.

    * ``HUFF``, ``CDIC`` and ``INDX`` records are skipped.
    * ``FONT`` records are decoded and stored under ``fonts/``.
    * Records of a known non-image type are stored under ``binary/`` with
      their signature as suffix (``-EOF`` for the end-of-file marker).
    * Anything else is probed with ``what()`` and stored under
      ``images/`` when recognised, otherwise under ``binary/``.

    :raises ValueError: if a font record cannot be read.
    """
    eof_marker = b'\xe9\x8e\r\n'
    known_types = {
        b'FLIS', b'FCIS', b'SRCS', eof_marker, b'RESC', b'BOUN', b'FDST',
        b'DATP', b'AUDI', b'VIDE'}
    skipped_types = {b'HUFF', b'CDIC', b'INDX'}

    entries = self.resource_map = []
    for index, rec in enumerate(self.resource_records):
        raw = rec.raw
        sig = raw[:4]
        if sig in skipped_types:
            continue
        # TODO: Ignore CNCX records as well

        prefix, suffix, ext, payload = 'binary', '', 'dat', raw
        if sig == b'FONT':
            font = read_font_record(raw)
            if font['err']:
                raise ValueError('Failed to read font record: %s Headers: %s'%(
                    font['err'], font['headers']))
            payload = font['font_data'] or font['raw_data']
            prefix, ext = 'fonts', font['ext']
        elif sig == eof_marker:
            suffix = '-EOF'
        elif sig in known_types:
            suffix = '-' + sig.decode('ascii')
        else:
            # Unknown signature, see if it is an image
            image_type = what(None, raw)
            if image_type:
                prefix, ext = 'images', image_type

        name = '%s/%06d%s.%s'%(prefix, index, suffix, ext)
        entries.append((name, payload))
def __call__(self, oeb, opts):
    """Replace ``data:`` image URIs in the spine with references to files.

    Every ``<img src="data:image/...">`` in every spine item that has a
    parsed tree is decoded (base64 or URL-quoted), its format is detected
    with ``what()``, and the ``src`` is replaced with the relative href of
    whatever ``self.convert_image_data_uri()`` returns. URIs that cannot
    be decoded or whose format is unknown are logged and left untouched.

    :param oeb: the book being processed; its log is stored on
        ``self.log``.
    :param opts: unused here.
    """
    from calibre.utils.imghdr import what
    self.log = oeb.log
    find_images = XPath('//h:img[@src]')
    for item in oeb.spine:
        root = item.data
        if not hasattr(root, 'xpath'):
            continue
        for img in find_images(root):
            src = img.get('src', '')
            if not src.startswith('data:'):
                continue
            header, _sep, payload = src.partition(',')
            if not (header.startswith('data:image/') and payload):
                continue
            if ';base64' not in header:
                payload = urlunquote(payload)
            else:
                # Whitespace is not significant in base64 data
                payload = re.sub(r'\s+', '', payload)
                from polyglot.binary import from_base64_bytes
                try:
                    payload = from_base64_bytes(payload)
                except Exception:
                    self.log.error('Found invalid base64 encoded data URI, ignoring it')
                    continue
            payload = as_bytes(payload)
            fmt = what(None, payload)
            if not fmt:
                self.log.warn('Image encoded as data URL has unknown format, ignoring')
                continue
            new_href = self.convert_image_data_uri(payload, fmt, oeb)
            img.set('src', item.relhref(new_href))
def cdb_set_fields(ctx, rd, book_id, library_id):
    """Apply metadata changes sent by a client to one book.

    The request body must be JSON or msgpack and contain ``changes`` (a
    mapping of field name -> new value, optionally with a ``cover`` entry
    holding base64 data or ``None``) and ``loaded_book_ids`` (ids the
    client currently has loaded).

    :param ctx: server context, used for restrictions and notifications.
    :param rd: request data; the body and headers are read from it.
    :param book_id: id of the book to modify.
    :param library_id: id of the library containing the book.
    :return: mapping of book id -> ``book_as_json()`` of that book, for
        ``book_id`` and for every changed book the client has loaded.
    :raises HTTPForbidden: if the user has per library restrictions.
    :raises HTTPBadRequest: for an unsupported content type, an
        undecodable body, or invalid cover data.
    """
    db = get_db(ctx, rd, library_id)
    if ctx.restriction_for(rd, db):
        raise HTTPForbidden('Cannot use the set fields interface with a user who has per library restrictions')
    raw = rd.read()
    ct = rd.inheaders.get('Content-Type', all=True)
    # Compare on the bare mime type, ignoring parameters such as charset
    ct = {x.lower().partition(';')[0] for x in ct}
    # Choose the decoder outside the try block below, so that this specific
    # error is not replaced by the generic 'Invalid encoded data' one
    if MSGPACK_MIME in ct:
        loads = msgpack_loads
    elif 'application/json' in ct:
        loads = json_loads
    else:
        raise HTTPBadRequest('Only JSON or msgpack requests are supported')
    try:
        data = loads(raw)
        changes, loaded_book_ids = data['changes'], frozenset(map(int, data['loaded_book_ids']))
    except Exception:
        raise HTTPBadRequest('Invalid encoded data')
    dirtied = set()
    # False means "cover not mentioned"; None is passed on to set_cover()
    cdata = changes.pop('cover', False)
    if cdata is not False:
        if cdata is not None:
            try:
                # Accept both bare base64 and a "data:...;base64,XXXX" URI
                cdata = standard_b64decode(cdata.split(',', 1)[-1].encode('ascii'))
            except Exception:
                raise HTTPBadRequest('Cover data is not valid base64 encoded data')
            try:
                fmt = what(None, cdata)
            except Exception:
                fmt = None
            if fmt not in ('jpeg', 'png'):
                raise HTTPBadRequest('Cover data must be either JPEG or PNG')
        dirtied |= db.set_cover({book_id: cdata})

    # items() rather than iteritems(): available on Python 2 and 3
    for field, value in changes.items():
        dirtied |= db.set_field(field, {book_id: value})
    ctx.notify_changes(db.backend.library_path, metadata(dirtied))
    # Each entry must describe its own book (bid), not the edited book
    return {bid: book_as_json(db, bid) for bid in (dirtied & loaded_book_ids) | {book_id}}
def generate_filename(self, rid, base=None, rid_map=None):
    """Write the image referenced by ``rid`` into ``self.dest_dir``.

    ``rid`` is resolved through ``rid_map`` (``self.rid_map`` when not
    given) and the target is read with ``self.docx.read()``. The chosen
    name is remembered in ``self.used`` keyed by the target, so a target
    that was already handled is returned without being re-read.

    :param rid: key to look up in ``rid_map``.
    :param base: optional preferred file name; by default it is derived
        from the last path component of the target.
    :param rid_map: optional mapping that overrides ``self.rid_map``.
    :return: the file name used inside ``self.dest_dir``.
    """
    mapping = self.rid_map if rid_map is None else rid_map
    target = mapping[rid]
    if target in self.used:
        return self.used[target]

    raw = self.docx.read(target)
    if not base:
        tail = mapping[rid].rpartition('/')[-1]
        base = ascii_filename(tail).replace(' ', '_') or 'image'
    ext = what(None, raw) or base.rpartition('.')[-1] or 'jpeg'
    stem = base.rpartition('.')[0] or 'image'
    base = stem + '.' + ext

    # Find the first "<stem>-<n>.<ext>" that has not been handed out yet
    taken = frozenset(self.used.itervalues())
    name = base
    if name in taken:
        head, tail = base.rpartition('.')[0::2]
        attempt = 1
        while name in taken:
            name = '%s-%d.%s' % (head, attempt, tail)
            attempt += 1

    self.used[target] = name
    destination = os.path.join(self.dest_dir, name)
    with open(destination, 'wb') as out:
        out.write(raw)
    self.all_images.add('images/' + name)
    return name