def read_image(self, href):
    """Return the image record for *href*, building and caching it on first use.

    The manifest is searched for *href* and, failing that, for its
    URL-quoted form. When no usable item is found a warning is logged and
    None is returned. Image data that cannot be identified is replaced by
    the bundled blank image.
    """
    if href in self.images:
        return self.images[href]

    hrefs = self.oeb.manifest.hrefs
    item = hrefs.get(href) or hrefs.get(urlquote(href))
    try:
        missing = item is None or not isinstance(item.data, bytes)
    except FileNotFoundError:
        # Accessing item.data can fail when the backing file is gone
        missing = True
    if missing:
        self.log.warning('Failed to find image:', href)
        return

    try:
        fmt, width, height = identify(item.data)
    except Exception:
        self.log.warning('Replacing corrupted image with blank: %s' % href)
        item.data = I('blank.png', data=True, allow_user_override=False)
        fmt, width, height = identify(item.data)

    image_fname = 'media/' + self.create_filename(href, fmt)
    image_rid = self.document_relationships.add_image(image_fname)
    self.images[href] = Image(image_rid, image_fname, width, height, fmt, item)
    item.unload_data_from_memory()
    return self.images[href]
def read_cover(stream, zin, mi, opfmeta, extract_cover):
    """Locate the cover image of an ODF document and record it on *mi*.

    :param stream: stream passed to ``odLoad`` to parse the document
    :param zin: zip-like object; ``zin.read(href)`` returns the image bytes
    :param mi: metadata object that receives ``cover``, ``odf_cover_frame``
        and, when *extract_cover* is true, ``cover_data``
    :param opfmeta: when true, a frame named ``opf.cover`` (any case) wins
        over the size-based heuristic
    :param extract_cover: when true, also store ``(fmt, raw)`` on *mi*
    """
    # search for an draw:image in a draw:frame with the name 'opf.cover'
    # if opf.metadata prop is false, just use the first image that
    # has a proper size (borrowed from docx)
    otext = odLoad(stream)
    cover_href = None
    cover_data = None
    cover_frame = None
    imgnum = 0
    for frm in otext.topnode.getElementsByType(odFrame):
        img = frm.getElementsByType(odImage)
        if len(img) == 0:
            continue
        i_href = img[0].getAttribute('href')
        try:
            raw = zin.read(i_href)
        except KeyError:
            continue
        try:
            fmt, width, height = identify(bytes(raw))
        except Exception:
            continue
        imgnum += 1
        # Tolerate frames without a name attribute instead of failing on
        # None.lower()
        if opfmeta and (frm.getAttribute('name') or '').lower() == u'opf.cover':
            cover_href = i_href
            cover_data = (fmt, raw)
            cover_frame = frm.getAttribute('name')  # could have upper case
            break
        # width > 0 keeps a zero-width image from raising ZeroDivisionError
        # in the aspect ratio test
        if (cover_href is None and imgnum == 1 and width > 0 and
                0.8 <= height/width <= 1.8 and height*width >= 12000):
            # Pick the first image as the cover if it is of a suitable size
            cover_href = i_href
            cover_data = (fmt, raw)
            if not opfmeta:
                break

    if cover_href is not None:
        mi.cover = cover_href
        mi.odf_cover_frame = cover_frame
        if extract_cover:
            if not cover_data:
                raw = zin.read(cover_href)
                try:
                    fmt = identify(bytes(raw))[0]
                except Exception:
                    pass
                else:
                    cover_data = (fmt, raw)
            mi.cover_data = cover_data
def read_cover(stream, zin, mi, opfmeta, extract_cover):
    """Find the cover image inside an ODF document and store it on *mi*.

    :param stream: stream handed to ``odLoad`` to parse the document
    :param zin: zip-like object whose ``read(href)`` yields image bytes
    :param mi: metadata object; ``cover`` and ``odf_cover_frame`` are set
        when a cover is found, ``cover_data`` too if *extract_cover* is true
    :param opfmeta: when true, prefer the frame named ``opf.cover``
        (compared case-insensitively) over the size heuristic
    :param extract_cover: when true, store ``(fmt, raw)`` as ``mi.cover_data``
    """
    # search for an draw:image in a draw:frame with the name 'opf.cover'
    # if opf.metadata prop is false, just use the first image that
    # has a proper size (borrowed from docx)
    otext = odLoad(stream)
    cover_href = None
    cover_data = None
    cover_frame = None
    imgnum = 0
    for frm in otext.topnode.getElementsByType(odFrame):
        img = frm.getElementsByType(odImage)
        if len(img) == 0:
            continue
        i_href = img[0].getAttribute('href')
        try:
            raw = zin.read(i_href)
        except KeyError:
            continue
        try:
            fmt, width, height = identify(bytes(raw))
        except Exception:
            continue
        imgnum += 1
        # A frame without a name attribute must not crash the scan
        if opfmeta and (frm.getAttribute('name') or '').lower() == u'opf.cover':
            cover_href = i_href
            cover_data = (fmt, raw)
            cover_frame = frm.getAttribute('name')  # could have upper case
            break
        # Guard against a zero width so the ratio test cannot raise
        # ZeroDivisionError
        if (cover_href is None and imgnum == 1 and width > 0 and
                0.8 <= height / width <= 1.8 and height * width >= 12000):
            # Pick the first image as the cover if it is of a suitable size
            cover_href = i_href
            cover_data = (fmt, raw)
            if not opfmeta:
                break

    if cover_href is not None:
        mi.cover = cover_href
        mi.odf_cover_frame = cover_frame
        if extract_cover:
            if not cover_data:
                raw = zin.read(cover_href)
                try:
                    fmt = identify(bytes(raw))[0]
                except Exception:
                    pass
                else:
                    cover_data = (fmt, raw)
            mi.cover_data = cover_data
def read_image(self, href):
    """Return the image record for *href*, creating and caching it if needed.

    Returns None when the manifest has no item for *href* or the item's
    data is not bytes. Data that cannot be identified is swapped for the
    bundled blank image.
    """
    if href in self.images:
        return self.images[href]

    item = self.oeb.manifest.hrefs.get(href)
    usable = item is not None and isinstance(item.data, bytes)
    if not usable:
        return

    try:
        fmt, width, height = identify(item.data)
    except Exception:
        self.log.warning('Replacing corrupted image with blank: %s' % href)
        item.data = I('blank.png', data=True, allow_user_override=False)
        fmt, width, height = identify(item.data)

    image_fname = 'media/' + self.create_filename(href, fmt)
    image_rid = self.document_relationships.add_image(image_fname)
    self.images[href] = Image(image_rid, image_fname, width, height, fmt, item)
    item.unload_data_from_memory()
    return self.images[href]
def read_metadata_kfx(stream, read_cover=True):
    """Read the metadata.kfx file that is found in the sdr book folder for KFX files.

    Returns a ``Metadata`` object. The cover is attached only when
    *read_cover* is true and the embedded cover data can be identified.
    """
    container = Container(stream.read())
    m = extract_metadata(container.decode())

    def has(key):
        # Truthy only when the key has a non-empty first value
        return m[key] and m[key][0]

    def get(key, single=True):
        values = m[key]
        if not single:
            return [clean_xml_chars(v) for v in values]
        return clean_xml_chars(values[0]) if values else ''

    title = get('title') or _('Unknown')
    authors = get('authors', False) or [_('Unknown')]
    auth_pat = re.compile(r'([^,]+?)\s*,\s+([^,]+)$')

    def fix_author(name):
        # Turn "Last, First" into "First Last" unless names are copied as is
        if tweaks['author_sort_copy_method'] != 'copy':
            match = auth_pat.match(name.strip())
            if match is not None:
                return match.group(2) + ' ' + match.group(1)
        return name

    mi = Metadata(title, [fix_author(a) for a in authors])

    if has('author'):
        mi.author_sort = get('author')

    for id_key in ('ASIN', 'content_id'):
        if has(id_key):
            mi.set_identifier('mobi-asin', get(id_key))
            break

    if has('languages'):
        canonical = (canonicalize_lang(x) for x in get('languages', False))
        langs = [lang for lang in canonical if lang]
        if langs:
            mi.languages = langs

    if has('issue_date'):
        try:
            mi.pubdate = parse_only_date(get('issue_date'))
        except Exception:
            pass

    if has('publisher'):
        publisher = get('publisher')
        if publisher != 'Unknown':
            mi.publisher = publisher

    if read_cover and m[COVER_KEY]:
        try:
            data = base64.standard_b64decode(m[COVER_KEY])
            fmt, w, h = identify(bytes(data))
        except Exception:
            w, h, fmt = 0, 0, None
        if fmt and w > -1 and h > -1:
            mi.cover_data = (fmt, data)

    return mi
def safe_img_data(container, name, mt):
    """Return ``(width, height)`` for the image *name*, or ``(0, 0)``.

    SVG media types are never measured, and any failure while identifying
    the image also yields ``(0, 0)``.
    """
    if 'svg' in mt:
        return 0, 0
    try:
        _fmt, width, height = identify(container.name_to_abspath(name))
    except Exception:
        return 0, 0
    return width, height
def __init__(self, path_or_bytes):
    """Load an image from raw bytes or from a file path.

    Anything that is not ``bytes`` is treated as a path and read in binary
    mode.
    """
    raw = path_or_bytes
    if not isinstance(raw, bytes):
        with open(raw, 'rb') as src:
            raw = src.read()
    self.img_data = raw
    # NOTE(review): the values returned by identify() are not used below;
    # the call is retained in case it is relied on to fail for unreadable
    # data - confirm before removing.
    _fmt, _width, _height = identify(raw)
    self.img, self.fmt = image_and_format_from_data(raw)
    self.width = self.img.width()
    self.height = self.img.height()
    self.cache_key = self.img.cacheKey()
def read_metadata_kfx(stream, read_cover=True):
    """Read the metadata.kfx file that is found in the sdr book folder for KFX files.

    Builds and returns a ``Metadata`` object; the cover is only attached
    when *read_cover* is true and the embedded image can be identified.
    """
    m = extract_metadata(Container(stream.read()).decode())

    def has(field):
        # A field counts as present when its first value is truthy
        return m[field] and m[field][0]

    def get(field, single=True):
        raw_values = m[field]
        if single:
            if raw_values:
                return clean_xml_chars(raw_values[0])
            return ''
        return [clean_xml_chars(item) for item in raw_values]

    auth_pat = re.compile(r'([^,]+?)\s*,\s+([^,]+)$')

    def fix_author(author):
        # "Last, First" -> "First Last", unless authors are copied verbatim
        if tweaks['author_sort_copy_method'] == 'copy':
            return author
        found = auth_pat.match(author.strip())
        if found is None:
            return author
        return found.group(2) + ' ' + found.group(1)

    title = get('title') or _('Unknown')
    authors = get('authors', False) or [_('Unknown')]
    mi = Metadata(title, [fix_author(author) for author in authors])

    if has('author'):
        mi.author_sort = get('author')

    if has('ASIN'):
        mi.set_identifier('mobi-asin', get('ASIN'))
    elif has('content_id'):
        mi.set_identifier('mobi-asin', get('content_id'))

    if has('languages'):
        langs = []
        for code in get('languages', False):
            lang = canonicalize_lang(code)
            if lang:
                langs.append(lang)
        if langs:
            mi.languages = langs

    if has('issue_date'):
        try:
            mi.pubdate = parse_only_date(get('issue_date'))
        except Exception:
            pass

    if has('publisher') and get('publisher') != 'Unknown':
        mi.publisher = get('publisher')

    if read_cover and m[COVER_KEY]:
        try:
            data = base64.standard_b64decode(m[COVER_KEY])
            fmt, w, h = identify(bytes(data))
        except Exception:
            w, h, fmt = 0, 0, None
        if fmt and w > -1 and h > -1:
            mi.cover_data = (fmt, data)
    return mi
def image_to_hexstring(self, data):
    """Return ``(hex_string, width, height)`` for the image in *data*.

    The image is first passed through ``save_cover_data_to``; the result
    is hex-encoded with a newline after every 64 bytes.
    """
    encoded = save_cover_data_to(data)
    width, height = identify(encoded)[1:]
    view = memoryview(encoded)
    # Images must be hex-encoded in 128 character lines
    chunks = [hexlify(view[start:start + 64])
              for start in range(0, len(encoded), 64)]
    return b'\n'.join(chunks).decode('ascii'), width, height
def workaround_ade_quirks(self, container, name):
    """Give floating images in *name* an explicit pixel size.

    Raises ValueError when the width of such an image cannot be read.
    """
    floating_images = container.parsed(name).xpath(
        '//*[local-name() = "img" and (@class = "float-right-img" or @class = "float-left-img")]')
    # ADE blows up floating images if their sizes are not specified
    for img in floating_images:
        if 'style' in img.attrib:
            continue
        imgname = container.href_to_name(img.get('src'), name)
        _fmt, width, height = identify(container.raw_data(imgname))
        if width == -1:
            raise ValueError('Failed to read size of: %s' % imgname)
        img.set('style', 'width: %dpx; height: %dpx' % (width, height))
def inspect_cover(self, href):
    """Return ``(width, height)`` of the manifest image matching *href*.

    ``(-1, -1)`` is returned when no manifest item matches or when the
    dimensions of every matching item fail to be read (the failure is
    logged).
    """
    from calibre.ebooks.oeb.base import urlnormalize
    for entry in self.oeb.manifest:
        if entry.href != urlnormalize(href):
            continue
        try:
            return identify(entry.data)[1:]
        except Exception:
            self.log.exception('Failed to read cover image dimensions')
    return -1, -1
def load_image(self, data):
    """Load *data* as the current image and notify listeners.

    The format is taken from ``identify``; if that fails the image is
    decoded without a format hint.
    """
    self.is_valid = False
    try:
        fmt = identify(data)[0].encode('ascii')
    except Exception:
        fmt = b''
    self.original_image_format = fmt.decode('ascii').lower()
    self.selection_state.reset()
    self.original_image_data = data
    if fmt:
        image = QImage.fromData(data, format=fmt)
    else:
        image = QImage.fromData(data)
    self.current_image = image
    self.original_image = image
    self.is_valid = not image.isNull()
    self.update()
    self.image_changed.emit(self.current_image)
def load_image(self, data):
    """Replace the displayed image with the one encoded in *data*.

    ``is_valid`` reflects whether the data could be decoded; the
    ``image_changed`` signal is emitted in either case.
    """
    self.is_valid = False
    try:
        detected = identify(data)[0].encode('ascii')
    except Exception:
        detected = b''
    self.original_image_format = detected.decode('ascii').lower()
    self.selection_state.reset()
    self.original_image_data = data
    # Only pass a format hint when one was detected
    hint = {'format': detected} if detected else {}
    loaded = QImage.fromData(data, **hint)
    self.current_image = loaded
    self.original_image = loaded
    self.is_valid = not loaded.isNull()
    self.update()
    self.image_changed.emit(self.current_image)
def test(src, url, sz=None):
    """Fetch *url* and check the body, then check conditional GET handling.

    Without *sz* the body must equal the resource *src*; with *sz* the
    identified width of the body must equal *sz*.
    """
    expected = P(src, data=True)
    conn.request('GET', url)
    response = conn.getresponse()
    self.ae(response.status, httplib.OK)
    body = response.read()
    if sz is not None:
        self.ae(sz, identify(body)[1])
    else:
        self.ae(body, expected)
    test_response(response)
    # A repeat request carrying the ETag must yield an empty 304
    etag = response.getheader('ETag')
    conn.request('GET', url, headers={'If-None-Match':etag})
    response = conn.getresponse()
    self.ae(response.status, httplib.NOT_MODIFIED)
    self.ae(b'', response.read())
def test(src, url, sz=None):
    """Request *url*, verify the payload, then verify the ETag round trip.

    When *sz* is None the payload must match the resource *src* exactly;
    otherwise the identified image width must equal *sz*.
    """
    reference = P(src, data=True)
    conn.request('GET', url)
    first = conn.getresponse()
    self.ae(first.status, http_client.OK)
    payload = first.read()
    if sz is not None:
        self.ae(sz, identify(payload)[1])
    else:
        self.ae(payload, reference)
    test_response(first)
    # Sending the ETag back must produce an empty NOT_MODIFIED reply
    conditional_headers = {'If-None-Match':first.getheader('ETag')}
    conn.request('GET', url, headers=conditional_headers)
    second = conn.getresponse()
    self.ae(second.status, http_client.NOT_MODIFIED)
    self.ae(b'', second.read())
def encode_thumbnail(thumbnail):
    """Encode the image part of a thumbnail, then return the 3 part tuple.

    *thumbnail* is either a ``(width, height, data)`` sequence or bare
    image data whose size is identified here. Returns None for a None
    input or when the size cannot be determined.
    """
    from calibre.utils.imghdr import identify
    if thumbnail is None:
        return None
    if isinstance(thumbnail, (tuple, list)):
        parts = thumbnail
    else:
        try:
            width, height = identify(as_bytes(thumbnail))[1:]
            if width < 0 or height < 0:
                return None
        except Exception:
            return None
        parts = (width, height, thumbnail)
    return (parts[0], parts[1], as_base64_unicode(parts[2]))
def get_cover(docx):
    """Return ``(fmt, raw)`` for the first embedded image that looks like a cover.

    An image qualifies when its height/width ratio lies in [0.8, 1.8] and
    it covers at least 160000 pixels. Images that cannot be read or
    identified are skipped. Returns None when nothing qualifies.
    """
    doc = docx.document
    get = docx.namespace.get
    images = docx.namespace.XPath(
        '//*[name()="w:drawing" or name()="w:pict"]/descendant::*[(name()="a:blip" and @r:embed) or (name()="v:imagedata" and @r:id)][1]')
    rid_map = docx.document_relationships[0]
    for image in images(doc):
        rid = get(image, 'r:embed') or get(image, 'r:id')
        if rid in rid_map:
            try:
                raw = docx.read(rid_map[rid])
                fmt, width, height = identify(bytes(raw))
            except Exception:
                continue
            # <= 0 rather than < 0: a zero width would otherwise raise
            # ZeroDivisionError in the ratio test below
            if width <= 0 or height <= 0:
                continue
            if 0.8 <= height/width <= 1.8 and height*width >= 160000:
                return (fmt, raw)
def process_result(log, result):
    """Validate and normalise a downloaded cover.

    *result* is a ``(plugin, data)`` pair. Returns
    ``(plugin, width, height, fmt, data)`` or None if the cover is
    unreadable or smaller than 50 pixels in either dimension (the problem
    is logged).
    """
    plugin, data = result
    try:
        if getattr(plugin, 'auto_trim_covers', False):
            original = image_from_data(data)
            trimmed = remove_borders_from_image(original)
            if trimmed is not original:
                data = image_to_data(trimmed)
        fmt, width, height = identify(data)
        if width < 0 or height < 0:
            raise ValueError('Could not read cover image dimensions')
        if width < 50 or height < 50:
            raise ValueError('Image too small')
        data = save_cover_data_to(data)
    except Exception:
        log.exception('Invalid cover from', plugin.name)
        return None
    else:
        return (plugin, width, height, fmt, data)
def _parse_cover_data(root, imgid, mi, ctx):
    """Set ``mi.cover_data`` from the ``fb:binary`` element with id *imgid*.

    Nothing happens when no such element exists or it has no text. A
    warning is printed when its mime-type maps to no known extension.
    """
    from calibre.ebooks.fb2 import base64_decode
    matches = ctx.XPath('//fb:binary[@id="%s"]'%imgid)(root)
    if not matches:
        return
    binary = matches[0]
    mimetype = binary.get('content-type', 'image/jpeg')
    extensions = guess_all_extensions(mimetype)
    if not extensions and mimetype.startswith('image/'):
        # Unknown image subtype: fall back to guessing from the id itself
        from_id = guess_type(imgid)[0]
        if from_id and from_id.startswith('image/'):
            extensions = guess_all_extensions(from_id)
    if not extensions:
        prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid))
        return
    pic_data = binary.text
    if pic_data:
        cdata = base64_decode(pic_data.strip())
        fmt = identify(bytes(cdata))[0]
        mi.cover_data = (fmt, cdata)
def _parse_cover_data(root, imgid, mi, ctx):
    """Decode the ``fb:binary`` element with id *imgid* into ``mi.cover_data``.

    Does nothing if the element is missing or empty; prints a warning if
    its mime-type cannot be mapped to any extension.
    """
    from calibre.ebooks.fb2 import base64_decode
    found = ctx.XPath('//fb:binary[@id="%s"]'%imgid)(root)
    if not found:
        return
    node = found[0]
    mimetype = node.get('content-type', 'image/jpeg')
    known_exts = guess_all_extensions(mimetype)
    if not known_exts and mimetype.startswith('image/'):
        # Try again with the type guessed from the id
        guessed = guess_type(imgid)[0]
        if guessed and guessed.startswith('image/'):
            known_exts = guess_all_extensions(guessed)
    if known_exts:
        encoded = node.text
        if encoded:
            cdata = base64_decode(encoded.strip())
            mi.cover_data = (identify(cdata)[0], cdata)
        return
    prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid))
def image_to_hexstring(self, data):
    """Return ``(hex_string, width, height)`` for the image in *data*.

    The image is normalised with ``save_cover_data_to`` and hex-encoded in
    lowercase, with a newline after every 128 hex characters (64 bytes)
    and no trailing newline.
    """
    from binascii import hexlify
    data = save_cover_data_to(data)
    width, height = identify(data)[1:]
    # Images must be broken up so that no line holds more than 128 hex
    # characters. Encoding 64 bytes at a time gives that layout without
    # building the string one character at a time.
    lines = [hexlify(data[i:i + 64]) for i in range(0, len(data), 64)]
    hex_string = b'\n'.join(lines)
    if not isinstance(hex_string, str):
        # hexlify() returns bytes on Python 3; hand back text there
        hex_string = hex_string.decode('ascii')
    return (hex_string, width, height)
def get_cover_data(stream, ext):  # {{{
    """Return ``(cover_bytes, pixel_area)`` for the book in *stream*.

    ``prefs['read_file_metadata']`` is forced on while the metadata is
    read and restored afterwards. Any failure yields ``(None, None)``.

    :param stream: open book file; it is closed by this function
    :param ext: format extension passed on to ``get_metadata``
    """
    from calibre.ebooks.metadata.meta import get_metadata
    old = prefs['read_file_metadata']
    if not old:
        prefs['read_file_metadata'] = True
    cdata = area = None

    try:
        with stream:
            mi = get_metadata(stream, ext)
        if mi.cover and os.access(mi.cover, os.R_OK):
            # Image data is binary: read it in 'rb' mode and close the file
            with open(mi.cover, 'rb') as f:
                cdata = f.read()
        elif mi.cover_data[1] is not None:
            cdata = mi.cover_data[1]
        if cdata:
            _fmt, width, height = identify(cdata)
            area = width*height
    except Exception:
        cdata = area = None
    finally:
        # Restore the preference even if something non-Exception escapes
        if old != prefs['read_file_metadata']:
            prefs['read_file_metadata'] = old

    return cdata, area
def mobimlize_elem(self, elem, stylizer, bstate, istates, ignore_valign=False):
    """Process one element and, recursively, its children.

    Elements whose tag is not a string in the XHTML namespace are ignored.
    A copy of the innermost formatting state (``istates[-1]``) is pushed
    for the duration of the call, filled in from the element's computed
    style, and popped again at the end. Content is emitted through
    ``self.mobimlize_content``.

    :param elem: element to process
    :param stylizer: object whose ``style(elem)`` gives the computed style
    :param bstate: block state shared along the current block
    :param istates: stack of inline formatting states
    :param ignore_valign: when true, vertical alignment is not turned into
        ``<sup>``/``<sub>`` wrappers (set on the recursive call that
        renders the wrapped content)
    """
    if not isinstance(elem.tag, basestring) \
            or namespace(elem.tag) != XHTML_NS:
        return
    style = stylizer.style(elem)
    # <mbp:frame-set/> does not exist lalalala
    if ((style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or
            style['visibility'] == 'hidden') and
            elem.get('data-calibre-jacket-searchable-tags', None) != '1'):
        id_ = elem.get('id', None)
        if id_:
            # Keep anchors so people can use display:none
            # to generate hidden TOCs
            tail = elem.tail
            elem.clear()
            elem.text = None
            elem.set('id', id_)
            elem.tail = tail
            elem.tag = XHTML('a')
        else:
            return
    tag = barename(elem.tag)
    # Work on a copy so changes do not leak into the parent's state
    istate = copy.copy(istates[-1])
    istate.rendered = False
    istate.list_num = 0
    if tag == 'ol' and 'start' in elem.attrib:
        try:
            istate.list_num = int(elem.attrib['start']) - 1
        except:
            pass
    istates.append(istate)
    left = 0
    display = style['display']
    if display == 'table-cell':
        display = 'inline'
    elif display.startswith('table'):
        display = 'block'
    # A block is anything not inline, not hidden, not floated and not <br>
    isblock = (not display.startswith('inline') and style['display'] != 'none')
    isblock = isblock and style['float'] == 'none'
    isblock = isblock and tag != 'br'
    if isblock:
        bstate.para = None
        istate.halign = style['text-align']
        rawti = style._get('text-indent')
        istate.indent = style['text-indent']
        if hasattr(rawti, 'strip') and '%' in rawti:
            # We have a percentage text indent, these can come out looking
            # too large if the user chooses a wide output profile like
            # tablet
            istate.indent = min(style._unit_convert(rawti, base=500), istate.indent)
        if style['margin-left'] == 'auto' \
                and style['margin-right'] == 'auto':
            istate.halign = 'center'
        margin = asfloat(style['margin-left'])
        padding = asfloat(style['padding-left'])
        if tag != 'body':
            left = margin + padding
        istate.left += left
        vmargin = asfloat(style['margin-top'])
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style['padding-top'])
        if vpadding > 0:
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    elif not istate.href:
        # Inline, non-link content: horizontal margin/padding is emulated
        # with non-breaking spaces added to the text
        margin = asfloat(style['margin-left'])
        padding = asfloat(style['padding-left'])
        lspace = margin + padding
        if lspace > 0:
            spaces = int(round((lspace * 3) / style['font-size']))
            elem.text = (u'\xa0' * spaces) + (elem.text or '')
        margin = asfloat(style['margin-right'])
        padding = asfloat(style['padding-right'])
        rspace = margin + padding
        if rspace > 0:
            spaces = int(round((rspace * 3) / style['font-size']))
            if len(elem) == 0:
                elem.text = (elem.text or '') + (u'\xa0' * spaces)
            else:
                last = elem[-1]
                last.text = (last.text or '') + (u'\xa0' * spaces)
    if bstate.content and style['page-break-before'] in PAGE_BREAKS:
        bstate.pbreak = True
    # Copy the character-level formatting into the inline state
    istate.fsize = self.mobimlize_font(style['font-size'])
    istate.italic = True if style['font-style'] == 'italic' else False
    weight = style['font-weight']
    istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
    istate.preserve = style['white-space'] == 'pre'
    istate.pre_wrap = style['white-space'] == 'pre-wrap'
    istate.bgcolor = style['background-color']
    istate.fgcolor = style['color']
    istate.strikethrough = style.effective_text_decoration == 'line-through'
    istate.underline = style.effective_text_decoration == 'underline'
    ff = style['font-family'].lower() if hasattr(style['font-family'], 'lower') else ''
    if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'):
        istate.family = 'monospace'
    elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff or
            'arial' in ff or 'helvetica' in ff):
        istate.family = 'sans-serif'
    else:
        istate.family = 'serif'
    if 'id' in elem.attrib:
        istate.ids.add(elem.attrib['id'])
    if 'name' in elem.attrib:
        istate.ids.add(elem.attrib['name'])
    if tag == 'a' and 'href' in elem.attrib:
        istate.href = elem.attrib['href']
    istate.attrib.clear()
    if tag == 'img' and 'src' in elem.attrib:
        istate.attrib['src'] = elem.attrib['src']
        istate.attrib['align'] = 'baseline'
        cssdict = style.cssdict()
        valign = cssdict.get('vertical-align', None)
        if valign in ('top', 'bottom', 'middle'):
            istate.attrib['align'] = valign
        for prop in ('width', 'height'):
            if cssdict[prop] != 'auto':
                value = style[prop]
                if value == getattr(self.profile, prop):
                    result = '100%'
                else:
                    # Amazon's renderer does not support
                    # img sizes in units other than px
                    # See #7520 for test case
                    try:
                        pixs = int(
                            round(float(value) / (72. / self.profile.dpi)))
                    except:
                        continue
                    result = str(pixs)
                istate.attrib[prop] = result
        if 'width' not in istate.attrib or 'height' not in istate.attrib:
            # At least one dimension is still unknown: read the real image
            # size and derive the missing value(s) from it
            href = self.current_spine_item.abshref(elem.attrib['src'])
            try:
                item = self.oeb.manifest.hrefs[urlnormalize(href)]
            except:
                self.oeb.logger.warn('Failed to find image:', href)
            else:
                try:
                    width, height = identify(item.data)[1:]
                except Exception:
                    self.oeb.logger.warn('Invalid image:', href)
                else:
                    if 'width' not in istate.attrib and 'height' not in \
                            istate.attrib:
                        istate.attrib['width'] = str(width)
                        istate.attrib['height'] = str(height)
                    else:
                        # Only one dimension given: keep the aspect ratio
                        ar = float(width) / float(height)
                        if 'width' not in istate.attrib:
                            try:
                                width = int(istate.attrib['height']) * ar
                            except:
                                pass
                            istate.attrib['width'] = str(int(width))
                        else:
                            try:
                                height = int(istate.attrib['width']) / ar
                            except:
                                pass
                            istate.attrib['height'] = str(int(height))
                item.unload_data_from_memory()
    elif tag == 'hr' and asfloat(style['width']) > 0 and style._get(
            'width') not in {'100%', 'auto'}:
        raww = style._get('width')
        if hasattr(raww, 'strip') and '%' in raww:
            istate.attrib['width'] = raww
        else:
            # Express an absolute width as a percentage of the profile width
            prop = style['width'] / self.profile.width
            istate.attrib['width'] = "%d%%" % int(round(prop * 100))
    elif display == 'table':
        tag = 'table'
    elif display == 'table-row':
        tag = 'tr'
    elif display == 'table-cell':
        tag = 'td'
    if tag in TABLE_TAGS and self.ignore_tables:
        tag = 'span' if tag == 'td' else 'div'

    if tag in ('table', 'td', 'tr'):
        col = style.backgroundColor
        if col:
            elem.set('bgcolor', col)
        css = style.cssdict()
        if 'border' in css or 'border-width' in css:
            elem.set('border', '1')
    if tag in TABLE_TAGS:
        for attr in ('rowspan', 'colspan', 'width', 'border',
                     'scope', 'bgcolor'):
            if attr in elem.attrib:
                istate.attrib[attr] = elem.attrib[attr]
    if tag == 'q':
        # Surround quoted text with typographic double quotes
        t = elem.text
        if not t:
            t = ''
        elem.text = u'\u201c' + t
        t = elem.tail
        if not t:
            t = ''
        elem.tail = u'\u201d' + t
    text = None
    if elem.text:
        if istate.preserve or istate.pre_wrap:
            text = elem.text
        elif (len(elem) > 0 and isspace(elem.text) and hasattr(elem[0].tag, 'rpartition') and
                elem[0].tag.rpartition('}')[-1] not in INLINE_TAGS):
            # Whitespace-only text before a non-inline first child is dropped
            text = None
        else:
            text = COLLAPSE.sub(' ', elem.text)
    valign = style['vertical-align']
    not_baseline = valign in ('super', 'sub', 'text-top',
                              'text-bottom', 'top', 'bottom') or (isinstance(
                                  valign, (float, int)) and abs(valign) != 0)
    issup = valign in ('super', 'text-top', 'top') or (isinstance(valign, (float, int)) and valign > 0)
    vtag = 'sup' if issup else 'sub'
    if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock:
        # Render the element into a scratch tree, then move the result
        # inside <sup>/<sub><small> in the real output
        nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
        vbstate = BlockState(etree.SubElement(nroot, XHTML('body')))
        vbstate.para = etree.SubElement(vbstate.body, XHTML('p'))
        self.mobimlize_elem(elem, stylizer, vbstate, istates,
                            ignore_valign=True)
        if len(istates) > 0:
            istates.pop()
        if len(istates) == 0:
            istates.append(FormatState())
        at_start = bstate.para is None
        if at_start:
            self.mobimlize_content('span', '', bstate, istates)
        parent = bstate.para if bstate.inline is None else bstate.inline
        if parent is not None:
            vtag = etree.SubElement(parent, XHTML(vtag))
            vtag = etree.SubElement(vtag, XHTML('small'))
            # Add anchors
            for child in vbstate.body:
                if child is not vbstate.para:
                    vtag.append(child)
                else:
                    break
            if vbstate.para is not None:
                if vbstate.para.text:
                    vtag.text = vbstate.para.text
                for child in vbstate.para:
                    vtag.append(child)
            return

    if tag == 'blockquote':
        # Margins are always honoured inside blockquotes; the option is
        # restored after the children have been processed
        old_mim = self.opts.mobi_ignore_margins
        self.opts.mobi_ignore_margins = False

    if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or (
            # We have an id but no text and no children, the id should still
            # be added.
            istate.ids and tag in ('a', 'span', 'i', 'b', 'u') and len(elem) == 0)):
        if tag == 'li' and len(istates) > 1 and 'value' in elem.attrib:
            try:
                value = int(elem.attrib['value'])
                istates[-2].list_num = value - 1
            except:
                pass
        self.mobimlize_content(tag, text, bstate, istates)
    for child in elem:
        self.mobimlize_elem(child, stylizer, bstate, istates)
        tail = None
        if child.tail:
            if istate.preserve or istate.pre_wrap:
                tail = child.tail
            elif bstate.para is None and isspace(child.tail):
                tail = None
            else:
                tail = COLLAPSE.sub(' ', child.tail)
        if tail:
            self.mobimlize_content(tag, tail, bstate, istates)

    if tag == 'blockquote':
        self.opts.mobi_ignore_margins = old_mim

    if bstate.content and style['page-break-after'] in PAGE_BREAKS:
        bstate.pbreak = True
    if isblock:
        para = bstate.para
        if para is not None and para.text == u'\xa0' and len(para) < 1:
            if style.height > 2:
                para.getparent().replace(para, etree.Element(XHTML('br')))
            else:
                # This is too small to be rendered effectively, drop it
                para.getparent().remove(para)
        bstate.para = None
        bstate.istate = None
        vmargin = asfloat(style['margin-bottom'])
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style['padding-bottom'])
        if vpadding > 0:
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    if bstate.nested and bstate.nested[-1].tag == elem.tag:
        bstate.nested.pop()
    # Drop the state pushed for this element
    istates.pop()
def cleanup_markup(log, root, styles, dest_dir, detect_cover, XPath):
    """Simplify the generated markup in *root* in place.

    Runs a fixed sequence of clean-up passes over the tree. When
    *detect_cover* is true and the first image of the document looks like
    a cover, that image is removed from the tree and its file path is
    returned; otherwise the function returns None.

    :param log: logger, used for a debug message on cover detection
    :param root: root element of the tree to clean
    :param styles: object whose ``classes`` values form the class -> css map
    :param dest_dir: directory that image ``src`` paths are relative to
    :param detect_cover: enable the cover heuristic
    :param XPath: factory for compiled XPath expressions
    """
    # Move <hr>s outside paragraphs, if possible.
    pancestor = XPath('|'.join('ancestor::%s[1]' % x for x in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')))
    for hr in root.xpath('//span/hr'):
        p = pancestor(hr)
        if p:
            p = p[0]
            descendants = tuple(p.iterdescendants())
            # Only safe when the <hr> is the very last thing in the block
            if descendants[-1] is hr:
                parent = p.getparent()
                idx = parent.index(p)
                parent.insert(idx+1, hr)
                hr.tail = '\n\t'

    # Merge consecutive spans that have the same styling
    # NOTE(review): a run still pending when the loop ends is not passed
    # to merge_run() - confirm whether that is intended.
    current_run = []
    for span in root.xpath('//span[not(@style or @lang)]'):
        if not current_run:
            current_run.append(span)
        else:
            last = current_run[-1]
            if mergeable(last, span):
                current_run.append(span)
            else:
                if len(current_run) > 1:
                    merge_run(current_run)
                current_run = [span]

    # Remove unnecessary span tags that are the only child of a parent block
    # element
    class_map = dict(styles.classes.itervalues())
    parents = ('p', 'div') + tuple('h%d' % i for i in xrange(1, 7))
    for parent in root.xpath('//*[(%s) and count(span)=1]' % ' or '.join('name()="%s"' % t for t in parents)):
        if len(parent) == 1 and not parent.text and not parent[0].tail and not parent[0].get('id', None):
            # We have a block whose contents are entirely enclosed in a <span>
            span = parent[0]
            span_class = span.get('class', None)
            span_css = class_map.get(span_class, {})
            if liftable(span_css):
                pclass = parent.get('class', None)
                if span_class:
                    pclass = (pclass + ' ' + span_class) if pclass else span_class
                    parent.set('class', pclass)
                parent.text = span.text
                parent.remove(span)
                if span.get('lang'):
                    parent.set('lang', span.get('lang'))
                for child in span:
                    parent.append(child)

    # Make spans whose only styling is bold or italic into <b> and <i> tags
    for span in root.xpath('//span[@class and not(@style)]'):
        css = class_map.get(span.get('class', None), {})
        if len(css) == 1:
            if css == {'font-style':'italic'}:
                span.tag = 'i'
                del span.attrib['class']
            elif css == {'font-weight':'bold'}:
                span.tag = 'b'
                del span.attrib['class']

    # Get rid of <span>s that have no styling
    for span in root.xpath('//span[not(@class or @id or @style or @lang)]'):
        lift(span)

    # Convert <p><br style="page-break-after:always"> </p> style page breaks
    # into something the viewer will render as a page break
    for p in root.xpath('//p[br[@style="page-break-after:always"]]'):
        if len(p) == 1 and (not p[0].tail or not p[0].tail.strip()):
            p.remove(p[0])
            prefix = p.get('style', '')
            if prefix:
                prefix += '; '
            p.set('style', prefix + 'page-break-after:always')
            p.text = NBSP if not p.text else p.text

    if detect_cover:
        # Check if the first image in the document is possibly a cover
        img = root.xpath('//img[@src][1]')
        if img:
            img = img[0]
            path = os.path.join(dest_dir, img.get('src'))
            if os.path.exists(path) and before_count(root, img, limit=10) < 5:
                from calibre.utils.imghdr import identify
                try:
                    with lopen(path, 'rb') as imf:
                        fmt, width, height = identify(imf)
                except:
                    width, height, fmt = 0, 0, None  # noqa
                del fmt
                try:
                    # Same portrait-ish ratio and minimum-area test as above
                    is_cover = 0.8 <= height/width <= 1.8 and height*width >= 160000
                except ZeroDivisionError:
                    is_cover = False
                if is_cover:
                    log.debug('Detected an image that looks like a cover')
                    img.getparent().remove(img)
                    return path
def cleanup_markup(log, root, styles, dest_dir, detect_cover, XPath):
    """Tidy the generated markup under *root* in place.

    Applies a fixed series of clean-up passes. If *detect_cover* is true
    and the document's first image looks like a cover, the image element
    is removed and its file path returned; in every other case the
    function returns None.

    :param log: logger, used for a debug message on cover detection
    :param root: root element of the tree to clean
    :param styles: object whose ``classes`` values form the class -> css map
    :param dest_dir: directory that image ``src`` paths are relative to
    :param detect_cover: enable the cover heuristic
    :param XPath: factory for compiled XPath expressions
    """
    # Move <hr>s outside paragraphs, if possible.
    pancestor = XPath('|'.join('ancestor::%s[1]' % x for x in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')))
    for hr in root.xpath('//span/hr'):
        p = pancestor(hr)
        if p:
            p = p[0]
            descendants = tuple(p.iterdescendants())
            # The <hr> may only move if nothing follows it inside the block
            if descendants[-1] is hr:
                parent = p.getparent()
                idx = parent.index(p)
                parent.insert(idx + 1, hr)
                hr.tail = '\n\t'

    # Merge consecutive spans that have the same styling
    # NOTE(review): the run collected last is never handed to merge_run()
    # once the loop finishes - confirm whether that is intended.
    current_run = []
    for span in root.xpath('//span[not(@style or @lang)]'):
        if not current_run:
            current_run.append(span)
        else:
            last = current_run[-1]
            if mergeable(last, span):
                current_run.append(span)
            else:
                if len(current_run) > 1:
                    merge_run(current_run)
                current_run = [span]

    # Remove unnecessary span tags that are the only child of a parent block
    # element
    class_map = dict(styles.classes.itervalues())
    parents = ('p', 'div') + tuple('h%d' % i for i in xrange(1, 7))
    for parent in root.xpath('//*[(%s) and count(span)=1]' % ' or '.join('name()="%s"' % t for t in parents)):
        if len(parent) == 1 and not parent.text and not parent[
                0].tail and not parent[0].get('id', None):
            # We have a block whose contents are entirely enclosed in a <span>
            span = parent[0]
            span_class = span.get('class', None)
            span_css = class_map.get(span_class, {})
            if liftable(span_css):
                pclass = parent.get('class', None)
                if span_class:
                    pclass = (pclass + ' ' + span_class) if pclass else span_class
                    parent.set('class', pclass)
                parent.text = span.text
                parent.remove(span)
                if span.get('lang'):
                    parent.set('lang', span.get('lang'))
                for child in span:
                    parent.append(child)

    # Make spans whose only styling is bold or italic into <b> and <i> tags
    for span in root.xpath('//span[@class and not(@style)]'):
        css = class_map.get(span.get('class', None), {})
        if len(css) == 1:
            if css == {'font-style': 'italic'}:
                span.tag = 'i'
                del span.attrib['class']
            elif css == {'font-weight': 'bold'}:
                span.tag = 'b'
                del span.attrib['class']

    # Get rid of <span>s that have no styling
    for span in root.xpath('//span[not(@class or @id or @style or @lang)]'):
        lift(span)

    # Convert <p><br style="page-break-after:always"> </p> style page breaks
    # into something the viewer will render as a page break
    for p in root.xpath('//p[br[@style="page-break-after:always"]]'):
        if len(p) == 1 and (not p[0].tail or not p[0].tail.strip()):
            p.remove(p[0])
            prefix = p.get('style', '')
            if prefix:
                prefix += '; '
            p.set('style', prefix + 'page-break-after:always')
            p.text = NBSP if not p.text else p.text

    if detect_cover:
        # Check if the first image in the document is possibly a cover
        img = root.xpath('//img[@src][1]')
        if img:
            img = img[0]
            path = os.path.join(dest_dir, img.get('src'))
            if os.path.exists(path) and before_count(root, img, limit=10) < 5:
                from calibre.utils.imghdr import identify
                try:
                    with lopen(path, 'rb') as imf:
                        fmt, width, height = identify(imf)
                except:
                    width, height, fmt = 0, 0, None  # noqa
                del fmt
                try:
                    # Cover heuristic: height/width ratio and minimum area
                    is_cover = 0.8 <= height / width <= 1.8 and height * width >= 160000
                except ZeroDivisionError:
                    is_cover = False
                if is_cover:
                    log.debug('Detected an image that looks like a cover')
                    img.getparent().remove(img)
                    return path
def create_epub_cover(container, cover_path, existing_image, options=None):
    """Add a cover image and a title page that displays it to ``container``.

    :param container: the book container being edited.
    :param cover_path: path of the cover image, or a callable that is invoked
        as ``cover_path("write_image", dest)`` to write the image data.
    :param existing_image: name of an image already in the container to use as
        the cover; when falsy a new image item is created from ``cover_path``.
    :param options: optional mapping with the keys ``keep_aspect``, ``no_svg``
        and ``template``; when ``None`` the ``epub_output`` defaults are used.
    :return: ``(raster_cover, titlepage)`` container names.
    """
    from calibre.ebooks.conversion.config import load_defaults
    from calibre.ebooks.oeb.transforms.cover import CoverManager

    try:
        ext = cover_path.rpartition(".")[-1].lower()
    except Exception:
        # cover_path may be a callable and therefore have no rpartition()
        ext = "jpeg"
    cname, tname = "cover." + ext, "titlepage.xhtml"
    recommended_folders = get_recommended_folders(container, (cname, tname))

    if existing_image:
        raster_cover = existing_image
        manifest_id = {v: k for k, v in container.manifest_id_map.iteritems()}[existing_image]
        raster_cover_item = container.opf_xpath('//opf:manifest/*[@id="%s"]' % manifest_id)[0]
    else:
        folder = recommended_folders[cname]
        if folder:
            cname = folder + "/" + cname
        raster_cover_item = container.generate_item(cname, id_prefix="cover")
        raster_cover = container.href_to_name(raster_cover_item.get("href"), container.opf_name)
        with container.open(raster_cover, "wb") as dest:
            if callable(cover_path):
                cover_path("write_image", dest)
            else:
                with lopen(cover_path, "rb") as src:
                    shutil.copyfileobj(src, dest)

    if options is None:
        opts = load_defaults("epub_output")
        keep_aspect = opts.get("preserve_cover_aspect_ratio", False)
        no_svg = opts.get("no_svg_cover", False)
    else:
        keep_aspect = options.get("keep_aspect", False)
        no_svg = options.get("no_svg", False)

    if no_svg:
        # "%%" because the template is %-formatted with the image href below
        style = 'style="height: 100%%"'
        templ = CoverManager.NONSVG_TEMPLATE.replace("__style__", style)
    else:
        if callable(cover_path):
            templ = (options or {}).get("template", CoverManager.SVG_TEMPLATE)
        else:
            # Fallback dimensions used when the image cannot be identified
            width, height = 600, 800
            try:
                if existing_image:
                    width, height = identify(container.raw_data(existing_image, decode=False))[1:]
                else:
                    with lopen(cover_path, "rb") as csrc:
                        width, height = identify(csrc)[1:]
            except Exception:
                container.log.exception("Failed to get width and height of cover")
            ar = "xMidYMid meet" if keep_aspect else "none"
            templ = CoverManager.SVG_TEMPLATE.replace("__ar__", ar)
            templ = templ.replace("__viewbox__", "0 0 %d %d" % (width, height))
            templ = templ.replace("__width__", str(width))
            templ = templ.replace("__height__", str(height))

    folder = recommended_folders[tname]
    if folder:
        tname = folder + "/" + tname
    titlepage_item = container.generate_item(tname, id_prefix="titlepage")
    titlepage = container.href_to_name(titlepage_item.get("href"), container.opf_name)

    # Interpolate the href in the same string type as the template, so that
    # bytes and text are never mixed, then make sure bytes are written out.
    href = container.name_to_href(raster_cover, titlepage)
    if isinstance(templ, bytes):
        if not isinstance(href, bytes):
            href = href.encode("utf-8")
        raw = templ % href
    else:
        if isinstance(href, bytes):
            href = href.decode("utf-8")
        raw = (templ % href).encode("utf-8")
    with container.open(titlepage, "wb") as f:
        f.write(raw)

    # We have to make sure the raster cover item has id="cover" for the moron
    # that wrote the Nook firmware
    if raster_cover_item.get("id") != "cover":
        from calibre.ebooks.oeb.base import uuid_id

        newid = uuid_id()
        # Move whatever currently owns the "cover" id out of the way
        for item in container.opf_xpath('//*[@id="cover"]'):
            item.set("id", newid)
        for item in container.opf_xpath('//*[@idref="cover"]'):
            item.set("idref", newid)
        raster_cover_item.set("id", "cover")

    # The title page becomes the first item in the spine
    spine = container.opf_xpath("//opf:spine")[0]
    ref = spine.makeelement(OPF("itemref"), idref=titlepage_item.get("id"))
    container.insert_into_xml(spine, ref, index=0)
    guide = container.opf_get_or_create("guide")
    container.insert_into_xml(
        guide,
        guide.makeelement(
            OPF("reference"),
            type="cover",
            title=_("Cover"),
            href=container.name_to_href(titlepage, base=container.opf_name),
        ),
    )
    metadata = container.opf_get_or_create("metadata")
    meta = metadata.makeelement(OPF("meta"), name="cover")
    meta.set("content", raster_cover_item.get("id"))
    container.insert_into_xml(metadata, meta)

    return raster_cover, titlepage
def extract_content(self, output_dir):
    """Write text and image records into ``output_dir`` and convert to OEB.

    Text records are written as ``<uid>.html``, images as
    ``images/<uid>.jpg``. The home record is then passed to the HTML input
    plugin.

    :param output_dir: directory that receives the extracted files.
    :return: the object returned by the HTML input plugin's ``convert``.
    :raises Exception: if the home.html record cannot be determined.
    """
    # Each text record is independent (unless the continuation
    # value is set in the previous record). Put each converted
    # text record into a separate file. We will reference the
    # home.html file as the first file and let the HTML input
    # plugin assemble the order based on hyperlinks.
    with CurrentDir(output_dir):
        for uid, num in self.uid_text_secion_number.items():
            self.log.debug('Writing record with uid: %s as %s.html' % (uid, uid))
            with open('%s.html' % uid, 'wb') as htmlf:
                html = u'<html><body>'
                section_header, section_data = self.sections[num]
                if section_header.type == DATATYPE_PHTML:
                    html += self.process_phtml(section_data.data, section_data.header.paragraph_offsets)
                elif section_header.type == DATATYPE_PHTML_COMPRESSED:
                    d = self.decompress_phtml(section_data.data)
                    html += self.process_phtml(d, section_data.header.paragraph_offsets).decode(
                        self.get_text_uid_encoding(section_header.uid), 'replace')
                html += '</body></html>'
                htmlf.write(html.encode('utf-8'))

    # Images.
    # Remember which image uids were written so composite images can check
    # that all of their parts exist.
    images = set()
    images_dir = os.path.join(output_dir, 'images/')
    if not os.path.exists(images_dir):
        os.makedirs(images_dir)
    with CurrentDir(images_dir):
        # Single images.
        for uid, num in self.uid_image_section_number.items():
            section_header, section_data = self.sections[num]
            if section_data:
                idata = None
                if section_header.type == DATATYPE_TBMP:
                    idata = section_data
                elif section_header.type == DATATYPE_TBMP_COMPRESSED:
                    if self.header_record.compression == 1:
                        idata = decompress_doc(section_data)
                    elif self.header_record.compression == 2:
                        idata = zlib.decompress(section_data)
                try:
                    save_cover_data_to(idata, '%s.jpg' % uid, compression_quality=70)
                    images.add(uid)
                    self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid))
                except Exception as e:
                    self.log.error('Failed to write image with uid %s: %s' % (uid, e))
            else:
                self.log.error('Failed to write image with uid %s: No data.' % uid)

        # Composite images.
        # We're going to use the already compressed .jpg images here.
        for uid, num in self.uid_composite_image_section_number.items():
            try:
                section_header, section_data = self.sections[num]
                # Get the final width and height.
                width = 0
                height = 0
                for row in section_data.layout:
                    row_width = 0
                    col_height = 0
                    for col in row:
                        if col not in images:
                            raise Exception('Image with uid: %s missing.' % col)
                        # Close the handle as soon as the size is known
                        with lopen('%s.jpg' % col, 'rb') as part:
                            w, h = identify(part)[1:]
                        row_width += w
                        if col_height < h:
                            col_height = h
                    if width < row_width:
                        width = row_width
                    height += col_height
                # Create a new image the total size of all image
                # parts. Put the parts into the new image.
                with Canvas(width, height) as canvas:
                    y_off = 0
                    for row in section_data.layout:
                        x_off = 0
                        largest_height = 0
                        for col in row:
                            with lopen('%s.jpg' % col, 'rb') as part:
                                im = image_from_data(part.read())
                            canvas.compose(im, x_off, y_off)
                            w, h = im.width(), im.height()
                            x_off += w
                            if largest_height < h:
                                largest_height = h
                        y_off += largest_height
                # The file must be opened for binary writing; opening it
                # without a mode made every composite write fail.
                with lopen('%s.jpg' % uid, 'wb') as out:
                    out.write(canvas.export(compression_quality=70))
                self.log.debug('Wrote composite image with uid %s to images/%s.jpg' % (uid, uid))
            except Exception as e:
                self.log.error('Failed to write composite image with uid %s: %s' % (uid, e))

    # Run the HTML through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(self.options, opt.option.name, opt.recommended_value)
    self.options.input_encoding = 'utf-8'
    odi = self.options.debug_pipeline
    self.options.debug_pipeline = None
    try:
        # Determine the home.html record uid. This should be set in the
        # reserved values in the metadata record. home.html is the first
        # text record (should have hyper link references to other records)
        # in the document.
        try:
            home_html = self.header_record.home_html
            if not home_html:
                # next(iter(...)) works whether items() is a list or a view
                home_html = next(iter(self.uid_text_secion_number.items()))[0]
        except Exception:
            raise Exception('Could not determine home.html')
        # Generate oeb from html conversion.
        oeb = html_input.convert(open('%s.html' % home_html, 'rb'), self.options, 'html', self.log, {})
    finally:
        # Restore the caller's setting even if the conversion fails
        self.options.debug_pipeline = odi

    return oeb
def test_get(self):  # {{{
    'Test /get'
    # End-to-end check of the /get endpoint: formats, covers, thumbnails,
    # OPF and JSON metadata, and the Used-Cache header after each request.
    with self.create_server() as server:
        db = server.handler.router.ctx.library_broker.get(None)
        conn = server.connect()

        def get(what, book_id, library_id=None, q=''):
            # Issue GET /get/<what>/<book_id>[/<library_id>][?q] and return
            # the response object together with the fully read body.
            q = ('?' + q) if q else q
            conn.request('GET', '/get/%s/%s' % (what, book_id) + (('/' + library_id) if library_id else '') + q)
            r = conn.getresponse()
            return r, r.read()

        # Test various invalid parameters
        def bad(*args):
            r, data = get(*args)
            self.ae(r.status, httplib.NOT_FOUND)
        bad('xxx', 1)
        bad('fmt1', 10)
        bad('fmt1', 1, 'zzzz')
        bad('fmt1', 'xx')

        # Test simple fetching of format without metadata update
        r, data = get('fmt1', 1, db.server_library_id)
        self.ae(data, db.format(1, 'fmt1'))
        self.assertIsNotNone(r.getheader('Content-Disposition'))
        self.ae(r.getheader('Used-Cache'), 'no')
        # The second request for the same format must be served from cache
        r, data = get('fmt1', 1)
        self.ae(data, db.format(1, 'fmt1'))
        self.ae(r.getheader('Used-Cache'), 'yes')

        # Test fetching of format with metadata update
        raw = P('quick_start/eng.epub', data=True)
        r, data = get('epub', 1)
        self.ae(r.status, httplib.OK)
        etag = r.getheader('ETag')
        self.assertIsNotNone(etag)
        self.ae(r.getheader('Used-Cache'), 'no')
        self.assertTrue(data.startswith(b'PK'))
        self.assertGreaterEqual(len(data), len(raw))
        # Changing metadata must change the ETag and bypass the cache
        db.set_field('title', {1:'changed'})
        r, data = get('epub', 1)
        self.assertNotEqual(r.getheader('ETag'), etag)
        etag = r.getheader('ETag')
        self.ae(r.getheader('Used-Cache'), 'no')
        mi = get_metadata(BytesIO(data), extract_cover=False)
        self.ae(mi.title, 'changed')
        r, data = get('epub', 1)
        self.ae(r.getheader('Used-Cache'), 'yes')

        # Test plugboards
        import calibre.library.save_to_disk as c
        # DEBUG is switched off for this section and restored in the finally
        orig, c.DEBUG = c.DEBUG, False
        try:
            db.set_pref('plugboards', {u'epub': {u'content_server': [[u'changed, {title}', u'title']]}})
            # this is needed as the cache is not invalidated for plugboard changes
            db.set_field('title', {1:'again'})
            r, data = get('epub', 1)
            self.assertNotEqual(r.getheader('ETag'), etag)
            etag = r.getheader('ETag')
            self.ae(r.getheader('Used-Cache'), 'no')
            mi = get_metadata(BytesIO(data), extract_cover=False)
            self.ae(mi.title, 'changed, again')
        finally:
            c.DEBUG = orig

        # Test the serving of covers
        def change_cover(count, book_id=2):
            # NOTE(review): set_cover always targets book 2, while only the
            # mtime bump uses book_id. Later assertions on book 2's cover may
            # rely on this; confirm before changing.
            cpath = db.format_abspath(book_id, '__COVER_INTERNAL__')
            db.set_cover({2:I('lt.png', data=True)})
            t = time.time() + 1 + count
            # Ensure mtime changes, needed on OS X where HFS+ has a 1s
            # mtime resolution
            os.utime(cpath, (t, t))

        r, data = get('cover', 1)
        self.ae(r.status, httplib.OK)
        self.ae(data, db.cover(1))
        self.ae(r.getheader('Used-Cache'), 'no')
        self.ae(r.getheader('Content-Type'), 'image/jpeg')
        r, data = get('cover', 1)
        self.ae(r.status, httplib.OK)
        self.ae(data, db.cover(1))
        self.ae(r.getheader('Used-Cache'), 'yes')
        r, data = get('cover', 3)
        self.ae(r.status, httplib.OK)  # Auto generated cover
        r, data = get('thumb', 1)
        self.ae(r.status, httplib.OK)
        self.ae(identify(data), ('jpeg', 60, 60))
        self.ae(r.getheader('Used-Cache'), 'no')
        r, data = get('thumb', 1)
        self.ae(r.status, httplib.OK)
        self.ae(r.getheader('Used-Cache'), 'yes')
        r, data = get('thumb', 1, q='sz=100')
        self.ae(r.status, httplib.OK)
        self.ae(identify(data), ('jpeg', 100, 100))
        self.ae(r.getheader('Used-Cache'), 'no')
        # sz=100x100 is expected to hit the cache entry created by sz=100
        r, data = get('thumb', 1, q='sz=100x100')
        self.ae(r.status, httplib.OK)
        self.ae(r.getheader('Used-Cache'), 'yes')
        change_cover(1, 1)
        r, data = get('thumb', 1, q='sz=100')
        self.ae(r.status, httplib.OK)
        self.ae(identify(data), ('jpeg', 100, 100))
        self.ae(r.getheader('Used-Cache'), 'no')

        # Test file sharing in cache
        r, data = get('cover', 2)
        self.ae(r.status, httplib.OK)
        self.ae(data, db.cover(2))
        self.ae(r.getheader('Used-Cache'), 'no')
        # The Tempfile header carries the hex-encoded path of the cached file
        path = binascii.unhexlify(r.getheader('Tempfile')).decode('utf-8')
        f, fdata = share_open(path, 'rb'), data
        # Now force an update
        change_cover(1)
        r, data = get('cover', 2)
        self.ae(r.status, httplib.OK)
        self.ae(data, db.cover(2))
        self.ae(r.getheader('Used-Cache'), 'no')
        path = binascii.unhexlify(r.getheader('Tempfile')).decode('utf-8')
        f2, f2data = share_open(path, 'rb'), data
        # Do it again
        change_cover(2)
        r, data = get('cover', 2)
        self.ae(r.status, httplib.OK)
        self.ae(data, db.cover(2))
        self.ae(r.getheader('Used-Cache'), 'no')
        # Handles opened before the updates must still yield the old data
        self.ae(f.read(), fdata)
        self.ae(f2.read(), f2data)

        # Test serving of metadata as opf
        r, data = get('opf', 1)
        self.ae(r.status, httplib.OK)
        self.ae(r.getheader('Content-Type'), 'application/oebps-package+xml; charset=UTF-8')
        self.assertIsNotNone(r.getheader('Last-Modified'))
        opf = OPF(BytesIO(data), populate_spine=False, try_to_guess_cover=False)
        self.ae(db.field_for('title', 1), opf.title)
        self.ae(db.field_for('authors', 1), tuple(opf.authors))
        # The same resource requested with gzip must decompress to the same data
        conn.request('GET', '/get/opf/1', headers={'Accept-Encoding':'gzip'})
        r = conn.getresponse()
        self.ae(r.status, httplib.OK), self.ae(r.getheader('Content-Encoding'), 'gzip')
        raw = r.read()
        self.ae(zlib.decompress(raw, 16+zlib.MAX_WBITS), data)

        # Test serving metadata as json
        r, data = get('json', 1)
        self.ae(r.status, httplib.OK)
        self.ae(db.field_for('title', 1), json.loads(data)['title'])
        conn.request('GET', '/get/json/1', headers={'Accept-Encoding':'gzip'})
        r = conn.getresponse()
        self.ae(r.status, httplib.OK), self.ae(r.getheader('Content-Encoding'), 'gzip')
        raw = r.read()
        self.ae(zlib.decompress(raw, 16+zlib.MAX_WBITS), data)
def create_epub_cover(container, cover_path, existing_image, options=None):
    """Add a cover image and a title page that displays it to ``container``.

    :param container: the book container being edited.
    :param cover_path: path of the cover image, or a callable that is invoked
        as ``cover_path('write_image', dest)`` to write the image data.
    :param existing_image: name of an image already in the container to use as
        the cover; when falsy a new image item is created from ``cover_path``.
    :param options: optional mapping with the keys ``keep_aspect``, ``no_svg``
        and ``template``; when ``None`` the ``epub_output`` defaults are used.
    :return: ``(raster_cover, titlepage)`` container names.
    """
    from calibre.ebooks.conversion.config import load_defaults
    from calibre.ebooks.oeb.transforms.cover import CoverManager

    try:
        ext = cover_path.rpartition('.')[-1].lower()
    except Exception:
        # cover_path may be a callable and therefore have no rpartition()
        ext = 'jpeg'
    cname, tname = 'cover.' + ext, 'titlepage.xhtml'
    recommended_folders = get_recommended_folders(container, (cname, tname))

    if existing_image:
        raster_cover = existing_image
        manifest_id = {v: k for k, v in iteritems(container.manifest_id_map)}[existing_image]
        raster_cover_item = container.opf_xpath('//opf:manifest/*[@id="%s"]' % manifest_id)[0]
    else:
        folder = recommended_folders[cname]
        if folder:
            cname = folder + '/' + cname
        raster_cover_item = container.generate_item(cname, id_prefix='cover')
        raster_cover = container.href_to_name(raster_cover_item.get('href'), container.opf_name)
        with container.open(raster_cover, 'wb') as dest:
            if callable(cover_path):
                cover_path('write_image', dest)
            else:
                with lopen(cover_path, 'rb') as src:
                    shutil.copyfileobj(src, dest)

    if options is None:
        opts = load_defaults('epub_output')
        keep_aspect = opts.get('preserve_cover_aspect_ratio', False)
        no_svg = opts.get('no_svg_cover', False)
    else:
        keep_aspect = options.get('keep_aspect', False)
        no_svg = options.get('no_svg', False)

    if no_svg:
        # '%%' because the template is %-formatted with the image href below
        style = 'style="height: 100%%"'
        templ = CoverManager.NONSVG_TEMPLATE.replace('__style__', style)
        has_svg = False
    else:
        if callable(cover_path):
            templ = (options or {}).get('template', CoverManager.SVG_TEMPLATE)
            has_svg = 'xlink:href' in templ
        else:
            # Fallback dimensions used when the image cannot be identified
            width, height = 600, 800
            has_svg = True
            try:
                if existing_image:
                    width, height = identify(container.raw_data(existing_image, decode=False))[1:]
                else:
                    with lopen(cover_path, 'rb') as csrc:
                        width, height = identify(csrc)[1:]
            except Exception:
                container.log.exception("Failed to get width and height of cover")
            ar = 'xMidYMid meet' if keep_aspect else 'none'
            templ = CoverManager.SVG_TEMPLATE.replace('__ar__', ar)
            templ = templ.replace('__viewbox__', '0 0 %d %d' % (width, height))
            templ = templ.replace('__width__', str(width))
            templ = templ.replace('__height__', str(height))

    folder = recommended_folders[tname]
    if folder:
        tname = folder + '/' + tname
    titlepage_item = container.generate_item(tname, id_prefix='titlepage')
    titlepage = container.href_to_name(titlepage_item.get('href'), container.opf_name)

    # Interpolate the href in the same string type as the template. Formatting
    # a text template with an encoded href would embed "b'...'" on Python 3.
    href = container.name_to_href(raster_cover, titlepage)
    if isinstance(templ, bytes):
        if not isinstance(href, bytes):
            href = href.encode('utf-8')
        raw = templ % href
    else:
        if isinstance(href, bytes):
            href = href.decode('utf-8')
        raw = (templ % href).encode('utf-8')
    with container.open(titlepage, 'wb') as f:
        f.write(raw)

    # We have to make sure the raster cover item has id="cover" for the moron
    # that wrote the Nook firmware
    if raster_cover_item.get('id') != 'cover':
        from calibre.ebooks.oeb.base import uuid_id
        newid = uuid_id()
        # Move whatever currently owns the "cover" id out of the way
        for item in container.opf_xpath('//*[@id="cover"]'):
            item.set('id', newid)
        for item in container.opf_xpath('//*[@idref="cover"]'):
            item.set('idref', newid)
        raster_cover_item.set('id', 'cover')

    # The title page becomes the first item in the spine
    spine = container.opf_xpath('//opf:spine')[0]
    ref = spine.makeelement(OPF('itemref'), idref=titlepage_item.get('id'))
    container.insert_into_xml(spine, ref, index=0)
    ver = container.opf_version_parsed
    if ver.major < 3:
        # Before OPF 3 the cover is declared via <guide> and <meta name="cover">
        guide = container.opf_get_or_create('guide')
        container.insert_into_xml(
            guide,
            guide.makeelement(OPF('reference'),
                              type='cover',
                              title=_('Cover'),
                              href=container.name_to_href(
                                  titlepage, base=container.opf_name)))
        metadata = container.opf_get_or_create('metadata')
        meta = metadata.makeelement(OPF('meta'), name='cover')
        meta.set('content', raster_cover_item.get('id'))
        container.insert_into_xml(metadata, meta)
    else:
        # OPF 3 and later mark the cover through manifest item properties
        container.apply_unique_properties(raster_cover, 'cover-image')
        container.apply_unique_properties(titlepage, 'calibre:title-page')
        if has_svg:
            container.add_properties(titlepage, 'svg')

    return raster_cover, titlepage
def create_epub_cover(container, cover_path, existing_image, options=None):
    """Add a cover image and a title page that displays it to ``container``.

    :param container: the book container being edited.
    :param cover_path: path of the cover image, or a callable that is invoked
        as ``cover_path('write_image', dest)`` to write the image data.
    :param existing_image: name of an image already in the container to use as
        the cover; when falsy a new image item is created from ``cover_path``.
    :param options: optional mapping with the keys ``keep_aspect``, ``no_svg``
        and ``template``; when ``None`` the ``epub_output`` defaults are used.
    :return: ``(raster_cover, titlepage)`` container names.
    """
    from calibre.ebooks.conversion.config import load_defaults
    from calibre.ebooks.oeb.transforms.cover import CoverManager

    try:
        ext = cover_path.rpartition('.')[-1].lower()
    except Exception:
        # cover_path may be a callable and therefore have no rpartition()
        ext = 'jpeg'
    cname, tname = 'cover.' + ext, 'titlepage.xhtml'
    recommended_folders = get_recommended_folders(container, (cname, tname))

    if existing_image:
        raster_cover = existing_image
        manifest_id = {v:k for k, v in container.manifest_id_map.iteritems()}[existing_image]
        raster_cover_item = container.opf_xpath('//opf:manifest/*[@id="%s"]' % manifest_id)[0]
    else:
        folder = recommended_folders[cname]
        if folder:
            cname = folder + '/' + cname
        raster_cover_item = container.generate_item(cname, id_prefix='cover')
        raster_cover = container.href_to_name(raster_cover_item.get('href'), container.opf_name)
        with container.open(raster_cover, 'wb') as dest:
            if callable(cover_path):
                cover_path('write_image', dest)
            else:
                with lopen(cover_path, 'rb') as src:
                    shutil.copyfileobj(src, dest)

    if options is None:
        opts = load_defaults('epub_output')
        keep_aspect = opts.get('preserve_cover_aspect_ratio', False)
        no_svg = opts.get('no_svg_cover', False)
    else:
        keep_aspect = options.get('keep_aspect', False)
        no_svg = options.get('no_svg', False)

    if no_svg:
        # '%%' because the template is %-formatted with the image href below
        style = 'style="height: 100%%"'
        templ = CoverManager.NONSVG_TEMPLATE.replace('__style__', style)
        has_svg = False
    else:
        if callable(cover_path):
            templ = (options or {}).get('template', CoverManager.SVG_TEMPLATE)
            has_svg = 'xlink:href' in templ
        else:
            # Fallback dimensions used when the image cannot be identified
            width, height = 600, 800
            has_svg = True
            try:
                if existing_image:
                    width, height = identify(container.raw_data(existing_image, decode=False))[1:]
                else:
                    with lopen(cover_path, 'rb') as csrc:
                        width, height = identify(csrc)[1:]
            except Exception:
                container.log.exception("Failed to get width and height of cover")
            ar = 'xMidYMid meet' if keep_aspect else 'none'
            templ = CoverManager.SVG_TEMPLATE.replace('__ar__', ar)
            templ = templ.replace('__viewbox__', '0 0 %d %d'%(width, height))
            templ = templ.replace('__width__', str(width))
            templ = templ.replace('__height__', str(height))

    folder = recommended_folders[tname]
    if folder:
        tname = folder + '/' + tname
    titlepage_item = container.generate_item(tname, id_prefix='titlepage')
    titlepage = container.href_to_name(titlepage_item.get('href'), container.opf_name)

    # Interpolate the href in the same string type as the template, so that
    # bytes and text are never mixed, then make sure bytes are written out.
    href = container.name_to_href(raster_cover, titlepage)
    if isinstance(templ, bytes):
        if not isinstance(href, bytes):
            href = href.encode('utf-8')
        raw = templ % href
    else:
        if isinstance(href, bytes):
            href = href.decode('utf-8')
        raw = (templ % href).encode('utf-8')
    with container.open(titlepage, 'wb') as f:
        f.write(raw)

    # We have to make sure the raster cover item has id="cover" for the moron
    # that wrote the Nook firmware
    if raster_cover_item.get('id') != 'cover':
        from calibre.ebooks.oeb.base import uuid_id
        newid = uuid_id()
        # Move whatever currently owns the "cover" id out of the way
        for item in container.opf_xpath('//*[@id="cover"]'):
            item.set('id', newid)
        for item in container.opf_xpath('//*[@idref="cover"]'):
            item.set('idref', newid)
        raster_cover_item.set('id', 'cover')

    # The title page becomes the first item in the spine
    spine = container.opf_xpath('//opf:spine')[0]
    ref = spine.makeelement(OPF('itemref'), idref=titlepage_item.get('id'))
    container.insert_into_xml(spine, ref, index=0)
    ver = container.opf_version_parsed
    if ver.major < 3:
        # Before OPF 3 the cover is declared via <guide> and <meta name="cover">
        guide = container.opf_get_or_create('guide')
        container.insert_into_xml(guide, guide.makeelement(
            OPF('reference'), type='cover', title=_('Cover'),
            href=container.name_to_href(titlepage, base=container.opf_name)))
        metadata = container.opf_get_or_create('metadata')
        meta = metadata.makeelement(OPF('meta'), name='cover')
        meta.set('content', raster_cover_item.get('id'))
        container.insert_into_xml(metadata, meta)
    else:
        # OPF 3 and later mark the cover through manifest item properties
        container.apply_unique_properties(raster_cover, 'cover-image')
        container.apply_unique_properties(titlepage, 'calibre:title-page')
        if has_svg:
            container.add_properties(titlepage, 'svg')

    return raster_cover, titlepage
def mobimlize_elem(self, elem, stylizer, bstate, istates, ignore_valign=False):
    """Convert ``elem`` and, recursively, its children into MOBI markup.

    A copy of the innermost format state (``istates[-1]``) is updated from the
    element's computed style, pushed on ``istates`` while the element is
    processed and popped again at the end. Content is emitted through
    ``self.mobimlize_content``.

    :param elem: the element to convert; ignored unless it is an XHTML
        element with a string tag.
    :param stylizer: provides the computed style via ``stylizer.style(elem)``.
    :param bstate: block level state, mutated in place.
    :param istates: stack of inline format states, mutated in place.
    :param ignore_valign: when true, the sub/superscript handling is skipped;
        set by the recursive call made from that handling.
    """
    if not isinstance(elem.tag, basestring) \
            or namespace(elem.tag) != XHTML_NS:
        return
    style = stylizer.style(elem)
    # <mbp:frame-set/> does not exist lalalala
    if ((style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or style['visibility'] == 'hidden') and
            elem.get('data-calibre-jacket-searchable-tags', None) != '1'):
        id_ = elem.get('id', None)
        if id_:
            # Keep anchors so people can use display:none
            # to generate hidden TOCs
            tail = elem.tail
            elem.clear()
            elem.text = None
            elem.set('id', id_)
            elem.tail = tail
            elem.tag = XHTML('a')
        else:
            return
    tag = barename(elem.tag)
    # Work on a copy so the changes apply to this element's subtree only
    istate = copy.copy(istates[-1])
    istate.rendered = False
    istate.list_num = 0
    if tag == 'ol' and 'start' in elem.attrib:
        try:
            istate.list_num = int(elem.attrib['start'])-1
        except:
            pass
    istates.append(istate)
    left = 0
    display = style['display']
    # Normalise table related display values to inline/block for the
    # block/inline decision below
    if display == 'table-cell':
        display = 'inline'
    elif display.startswith('table'):
        display = 'block'
    isblock = (not display.startswith('inline') and style['display'] != 'none')
    isblock = isblock and style['float'] == 'none'
    isblock = isblock and tag != 'br'
    if isblock:
        # A block element starts a new paragraph
        bstate.para = None
        istate.halign = style['text-align']
        rawti = style._get('text-indent')
        istate.indent = style['text-indent']
        if hasattr(rawti, 'strip') and '%' in rawti:
            # We have a percentage text indent, these can come out looking
            # too large if the user chooses a wide output profile like
            # tablet
            istate.indent = min(style._unit_convert(rawti, base=500), istate.indent)
        if style['margin-left'] == 'auto' \
                and style['margin-right'] == 'auto':
            istate.halign = 'center'
        margin = asfloat(style['margin-left'])
        padding = asfloat(style['padding-left'])
        if tag != 'body':
            left = margin + padding
        istate.left += left
        vmargin = asfloat(style['margin-top'])
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style['padding-top'])
        if vpadding > 0:
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    elif not istate.href:
        # Inline element outside a link: left/right margin and padding are
        # approximated with non-breaking spaces
        margin = asfloat(style['margin-left'])
        padding = asfloat(style['padding-left'])
        lspace = margin + padding
        if lspace > 0:
            spaces = int(round((lspace * 3) / style['font-size']))
            elem.text = (u'\xa0' * spaces) + (elem.text or '')
        margin = asfloat(style['margin-right'])
        padding = asfloat(style['padding-right'])
        rspace = margin + padding
        if rspace > 0:
            spaces = int(round((rspace * 3) / style['font-size']))
            if len(elem) == 0:
                elem.text = (elem.text or '') + (u'\xa0' * spaces)
            else:
                last = elem[-1]
                last.text = (last.text or '') + (u'\xa0' * spaces)
    if bstate.content and style['page-break-before'] in PAGE_BREAKS:
        bstate.pbreak = True
    # Copy the text formatting properties into the inline state
    istate.fsize = self.mobimlize_font(style['font-size'])
    istate.italic = True if style['font-style'] == 'italic' else False
    weight = style['font-weight']
    istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
    istate.preserve = style['white-space'] == 'pre'
    istate.pre_wrap = style['white-space'] == 'pre-wrap'
    istate.bgcolor = style['background-color']
    istate.fgcolor = style['color']
    istate.strikethrough = style.effective_text_decoration == 'line-through'
    istate.underline = style.effective_text_decoration == 'underline'
    # Reduce the font family to one of monospace, sans-serif or serif
    ff = style['font-family'].lower() if hasattr(style['font-family'], 'lower') else ''
    if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'):
        istate.family = 'monospace'
    elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff or
            'arial' in ff or 'helvetica' in ff):
        istate.family = 'sans-serif'
    else:
        istate.family = 'serif'
    if 'id' in elem.attrib:
        istate.ids.add(elem.attrib['id'])
    if 'name' in elem.attrib:
        istate.ids.add(elem.attrib['name'])
    if tag == 'a' and 'href' in elem.attrib:
        istate.href = elem.attrib['href']
    istate.attrib.clear()
    if tag == 'img' and 'src' in elem.attrib:
        istate.attrib['src'] = elem.attrib['src']
        istate.attrib['align'] = 'baseline'
        cssdict = style.cssdict()
        valign = cssdict.get('vertical-align', None)
        if valign in ('top', 'bottom', 'middle'):
            istate.attrib['align'] = valign
        for prop in ('width', 'height'):
            if cssdict[prop] != 'auto':
                value = style[prop]
                if value == getattr(self.profile, prop):
                    result = '100%'
                else:
                    # Amazon's renderer does not support
                    # img sizes in units other than px
                    # See #7520 for test case
                    try:
                        pixs = int(round(float(value) / (72./self.profile.dpi)))
                    except:
                        continue
                    result = str(pixs)
                istate.attrib[prop] = result
        if 'width' not in istate.attrib or 'height' not in istate.attrib:
            # At least one dimension is missing: derive it from the image data
            href = self.current_spine_item.abshref(elem.attrib['src'])
            try:
                item = self.oeb.manifest.hrefs[urlnormalize(href)]
            except:
                self.oeb.logger.warn('Failed to find image:', href)
            else:
                try:
                    width, height = identify(item.data)[1:]
                except Exception:
                    self.oeb.logger.warn('Invalid image:', href)
                else:
                    if 'width' not in istate.attrib and 'height' not in \
                            istate.attrib:
                        istate.attrib['width'] = str(width)
                        istate.attrib['height'] = str(height)
                    else:
                        # Only one dimension is known: scale the other one by
                        # the image's aspect ratio
                        ar = float(width)/float(height)
                        if 'width' not in istate.attrib:
                            try:
                                width = int(istate.attrib['height'])*ar
                            except:
                                pass
                            istate.attrib['width'] = str(int(width))
                        else:
                            try:
                                height = int(istate.attrib['width'])/ar
                            except:
                                pass
                            istate.attrib['height'] = str(int(height))
                    item.unload_data_from_memory()
    elif tag == 'hr' and asfloat(style['width']) > 0 and style._get('width') not in {'100%', 'auto'}:
        raww = style._get('width')
        if hasattr(raww, 'strip') and '%' in raww:
            istate.attrib['width'] = raww
        else:
            # Express the width as a percentage of the profile width
            prop = style['width'] / self.profile.width
            istate.attrib['width'] = "%d%%" % int(round(prop * 100))
    elif display == 'table':
        tag = 'table'
    elif display == 'table-row':
        tag = 'tr'
    elif display == 'table-cell':
        tag = 'td'
    if tag in TABLE_TAGS and self.ignore_tables:
        tag = 'span' if tag == 'td' else 'div'

    if tag in ('table', 'td', 'tr'):
        col = style.backgroundColor
        if col:
            elem.set('bgcolor', col)
        css = style.cssdict()
        if 'border' in css or 'border-width' in css:
            elem.set('border', '1')
    if tag in TABLE_TAGS:
        for attr in ('rowspan', 'colspan', 'width', 'border', 'scope',
                'bgcolor'):
            if attr in elem.attrib:
                istate.attrib[attr] = elem.attrib[attr]
    if tag == 'q':
        # Wrap the quotation in typographic double quotes
        t = elem.text
        if not t:
            t = ''
        elem.text = u'\u201c' + t
        t = elem.tail
        if not t:
            t = ''
        elem.tail = u'\u201d' + t
    text = None
    if elem.text:
        if istate.preserve or istate.pre_wrap:
            text = elem.text
        elif (len(elem) > 0 and isspace(elem.text) and hasattr(elem[0].tag, 'rpartition') and
                elem[0].tag.rpartition('}')[-1] not in INLINE_TAGS):
            # Whitespace-only text before a first child that is not an inline
            # tag is dropped
            text = None
        else:
            text = COLLAPSE.sub(' ', elem.text)
    valign = style['vertical-align']
    not_baseline = valign in ('super', 'sub', 'text-top',
            'text-bottom', 'top', 'bottom') or (
            isinstance(valign, (float, int)) and abs(valign) != 0)
    issup = valign in ('super', 'text-top', 'top') or (
        isinstance(valign, (float, int)) and valign > 0)
    vtag = 'sup' if issup else 'sub'
    if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock:
        # Render the element into a scratch tree first, then move the result
        # into a <sup>/<sub> + <small> wrapper in the real output
        nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
        vbstate = BlockState(etree.SubElement(nroot, XHTML('body')))
        vbstate.para = etree.SubElement(vbstate.body, XHTML('p'))
        self.mobimlize_elem(elem, stylizer, vbstate, istates,
                ignore_valign=True)
        if len(istates) > 0:
            istates.pop()
        if len(istates) == 0:
            istates.append(FormatState())
        at_start = bstate.para is None
        if at_start:
            self.mobimlize_content('span', '', bstate, istates)
        parent = bstate.para if bstate.inline is None else bstate.inline
        if parent is not None:
            vtag = etree.SubElement(parent, XHTML(vtag))
            vtag = etree.SubElement(vtag, XHTML('small'))
            # Add anchors
            for child in vbstate.body:
                if child is not vbstate.para:
                    vtag.append(child)
                else:
                    break
            if vbstate.para is not None:
                if vbstate.para.text:
                    vtag.text = vbstate.para.text
                for child in vbstate.para:
                    vtag.append(child)
            return

    if tag == 'blockquote':
        # Margins are always honoured inside a blockquote; the option is
        # restored after the children have been processed
        old_mim = self.opts.mobi_ignore_margins
        self.opts.mobi_ignore_margins = False

    if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or (
            # We have an id but no text and no children, the id should still
            # be added.
            istate.ids and tag in ('a', 'span', 'i', 'b', 'u') and
            len(elem)==0)):
        if tag == 'li' and len(istates) > 1 and 'value' in elem.attrib:
            try:
                value = int(elem.attrib['value'])
                istates[-2].list_num = value - 1
            except:
                pass
        self.mobimlize_content(tag, text, bstate, istates)
    for child in elem:
        self.mobimlize_elem(child, stylizer, bstate, istates)
        tail = None
        if child.tail:
            if istate.preserve or istate.pre_wrap:
                tail = child.tail
            elif bstate.para is None and isspace(child.tail):
                tail = None
            else:
                tail = COLLAPSE.sub(' ', child.tail)
        if tail:
            self.mobimlize_content(tag, tail, bstate, istates)

    if tag == 'blockquote':
        self.opts.mobi_ignore_margins = old_mim

    if bstate.content and style['page-break-after'] in PAGE_BREAKS:
        bstate.pbreak = True
    if isblock:
        para = bstate.para
        # A paragraph holding only a single non-breaking space is either
        # replaced by a <br> or dropped, depending on the styled height
        if para is not None and para.text == u'\xa0' and len(para) < 1:
            if style.height > 2:
                para.getparent().replace(para, etree.Element(XHTML('br')))
            else:
                # This is too small to be rendered effectively, drop it
                para.getparent().remove(para)
        bstate.para = None
        bstate.istate = None
        vmargin = asfloat(style['margin-bottom'])
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style['padding-bottom'])
        if vpadding > 0:
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    if bstate.nested and bstate.nested[-1].tag == elem.tag:
        bstate.nested.pop()
    # Drop the format state that was pushed for this element
    istates.pop()
def test_get(self):  # {{{
    'Test /get'
    # End-to-end check of the /get endpoint: formats, covers, thumbnails,
    # OPF and JSON metadata, and the Used-Cache header after each request.
    with self.create_server() as server:
        db = server.handler.router.ctx.library_broker.get(None)
        conn = server.connect()

        def get(what, book_id, library_id=None, q=''):
            # Issue GET /get/<what>/<book_id>[/<library_id>][?q] and return
            # the response object together with the fully read body.
            q = ('?' + q) if q else q
            conn.request(
                'GET', '/get/%s/%s' % (what, book_id) +
                (('/' + library_id) if library_id else '') + q)
            r = conn.getresponse()
            return r, r.read()

        # Test various invalid parameters
        def bad(*args):
            r, data = get(*args)
            self.ae(r.status, http_client.NOT_FOUND)

        bad('xxx', 1)
        bad('fmt1', 10)
        bad('fmt1', 1, 'zzzz')
        bad('fmt1', 'xx')

        # Test simple fetching of format without metadata update
        r, data = get('fmt1', 1, db.server_library_id)
        self.ae(data, db.format(1, 'fmt1'))
        self.assertIsNotNone(r.getheader('Content-Disposition'))
        self.ae(r.getheader('Used-Cache'), 'no')
        # The second request for the same format must be served from cache
        r, data = get('fmt1', 1)
        self.ae(data, db.format(1, 'fmt1'))
        self.ae(r.getheader('Used-Cache'), 'yes')

        # Test fetching of format with metadata update
        raw = P('quick_start/eng.epub', data=True)
        r, data = get('epub', 1)
        self.ae(r.status, http_client.OK)
        etag = r.getheader('ETag')
        self.assertIsNotNone(etag)
        self.ae(r.getheader('Used-Cache'), 'no')
        self.assertTrue(data.startswith(b'PK'))
        self.assertGreaterEqual(len(data), len(raw))
        # Changing metadata must change the ETag and bypass the cache
        db.set_field('title', {1: 'changed'})
        r, data = get('epub', 1)
        self.assertNotEqual(r.getheader('ETag'), etag)
        etag = r.getheader('ETag')
        self.ae(r.getheader('Used-Cache'), 'no')
        mi = get_metadata(BytesIO(data), extract_cover=False)
        self.ae(mi.title, 'changed')
        r, data = get('epub', 1)
        self.ae(r.getheader('Used-Cache'), 'yes')

        # Test plugboards
        import calibre.library.save_to_disk as c
        # DEBUG is switched off for this section and restored in the finally
        orig, c.DEBUG = c.DEBUG, False
        try:
            db.set_pref(
                'plugboards', {
                    u'epub': {
                        u'content_server': [[u'changed, {title}', u'title']]
                    }
                })
            # this is needed as the cache is not invalidated for plugboard changes
            db.set_field('title', {1: 'again'})
            r, data = get('epub', 1)
            self.assertNotEqual(r.getheader('ETag'), etag)
            etag = r.getheader('ETag')
            self.ae(r.getheader('Used-Cache'), 'no')
            mi = get_metadata(BytesIO(data), extract_cover=False)
            self.ae(mi.title, 'changed, again')
        finally:
            c.DEBUG = orig

        # Test the serving of covers
        def change_cover(count, book_id=2):
            # NOTE(review): set_cover always targets book 2, while only the
            # mtime bump uses book_id. Later assertions on book 2's cover may
            # rely on this; confirm before changing.
            cpath = db.format_abspath(book_id, '__COVER_INTERNAL__')
            db.set_cover({2: I('lt.png', data=True)})
            t = time.time() + 1 + count
            # Ensure mtime changes, needed on OS X where HFS+ has a 1s
            # mtime resolution
            os.utime(cpath, (t, t))

        r, data = get('cover', 1)
        self.ae(r.status, http_client.OK)
        self.ae(data, db.cover(1))
        self.ae(r.getheader('Used-Cache'), 'no')
        self.ae(r.getheader('Content-Type'), 'image/jpeg')
        r, data = get('cover', 1)
        self.ae(r.status, http_client.OK)
        self.ae(data, db.cover(1))
        self.ae(r.getheader('Used-Cache'), 'yes')
        r, data = get('cover', 3)
        self.ae(r.status, http_client.OK)  # Auto generated cover
        r, data = get('thumb', 1)
        self.ae(r.status, http_client.OK)
        self.ae(identify(data), ('jpeg', 60, 60))
        self.ae(r.getheader('Used-Cache'), 'no')
        r, data = get('thumb', 1)
        self.ae(r.status, http_client.OK)
        self.ae(r.getheader('Used-Cache'), 'yes')
        r, data = get('thumb', 1, q='sz=100')
        self.ae(r.status, http_client.OK)
        self.ae(identify(data), ('jpeg', 100, 100))
        self.ae(r.getheader('Used-Cache'), 'no')
        # sz=100x100 is expected to hit the cache entry created by sz=100
        r, data = get('thumb', 1, q='sz=100x100')
        self.ae(r.status, http_client.OK)
        self.ae(r.getheader('Used-Cache'), 'yes')
        change_cover(1, 1)
        r, data = get('thumb', 1, q='sz=100')
        self.ae(r.status, http_client.OK)
        self.ae(identify(data), ('jpeg', 100, 100))
        self.ae(r.getheader('Used-Cache'), 'no')

        # Test file sharing in cache
        r, data = get('cover', 2)
        self.ae(r.status, http_client.OK)
        self.ae(data, db.cover(2))
        self.ae(r.getheader('Used-Cache'), 'no')
        # The Tempfile header carries the hex-encoded path of the cached file
        path = from_hex_unicode(r.getheader('Tempfile'))
        f, fdata = share_open(path, 'rb'), data
        # Now force an update
        change_cover(1)
        r, data = get('cover', 2)
        self.ae(r.status, http_client.OK)
        self.ae(data, db.cover(2))
        self.ae(r.getheader('Used-Cache'), 'no')
        path = from_hex_unicode(r.getheader('Tempfile'))
        f2, f2data = share_open(path, 'rb'), data
        # Do it again
        change_cover(2)
        r, data = get('cover', 2)
        self.ae(r.status, http_client.OK)
        self.ae(data, db.cover(2))
        self.ae(r.getheader('Used-Cache'), 'no')
        # Handles opened before the updates must still yield the old data
        self.ae(f.read(), fdata)
        self.ae(f2.read(), f2data)

        # Test serving of metadata as opf
        r, data = get('opf', 1)
        self.ae(r.status, http_client.OK)
        self.ae(r.getheader('Content-Type'),
                'application/oebps-package+xml; charset=UTF-8')
        self.assertIsNotNone(r.getheader('Last-Modified'))
        opf = OPF(BytesIO(data),
                  populate_spine=False,
                  try_to_guess_cover=False)
        self.ae(db.field_for('title', 1), opf.title)
        self.ae(db.field_for('authors', 1), tuple(opf.authors))
        # The same resource requested with gzip must decompress to the same data
        conn.request('GET',
                     '/get/opf/1',
                     headers={'Accept-Encoding': 'gzip'})
        r = conn.getresponse()
        self.ae(r.status, http_client.OK), self.ae(r.getheader('Content-Encoding'), 'gzip')
        raw = r.read()
        self.ae(zlib.decompress(raw, 16 + zlib.MAX_WBITS), data)

        # Test serving metadata as json
        r, data = get('json', 1)
        self.ae(r.status, http_client.OK)
        self.ae(db.field_for('title', 1), json.loads(data)['title'])
        conn.request('GET',
                     '/get/json/1',
                     headers={'Accept-Encoding': 'gzip'})
        r = conn.getresponse()
        self.ae(r.status, http_client.OK), self.ae(r.getheader('Content-Encoding'), 'gzip')
        raw = r.read()
        self.ae(zlib.decompress(raw, 16 + zlib.MAX_WBITS), data)
def identify(self, data):
    """Return ``(width, height, fmt)`` for the image in *data*.

    Calls the ``identify`` function that is in scope, which yields
    ``(fmt, width, height)``, and hands the same three values back with the
    dimensions first and the format last.
    """
    image_format, w, h = identify(data)
    return (w, h, image_format)
def get_image_size(image_path):
    """Return the ``(width, height)`` of the image file at *image_path*.

    The file is opened in binary mode and passed to ``identify``; the format
    component of its ``(fmt, width, height)`` result is discarded.
    """
    # Use a context manager so the handle is closed as soon as the image has
    # been inspected, rather than whenever the garbage collector gets to it.
    with open(image_path, 'rb') as stream:
        _fmt, width, height = identify(stream)
    return width, height
def extract_content(self, output_dir):
    """Write the text and image records to *output_dir* and convert them.

    Three phases:

    1. Every text record in ``self.uid_text_secion_number`` is written to
       ``<uid>.html`` inside *output_dir*.
    2. Every image record is saved as ``images/<uid>.jpg``; composite images
       are then assembled from those already-written JPEG parts.
    3. The ``home.html`` record is run through the HTML input plugin.

    Failures while writing an individual image are logged via
    ``self.log.error`` and do not abort the extraction.

    :param output_dir: Directory that receives the ``.html`` files and the
        ``images/`` sub-directory.
    :return: The result of the HTML input plugin's ``convert`` call.
    :raises Exception: ``'Could not determine home.html'`` when no home
        record uid can be found.
    """
    # Each text record is independent (unless the continuation
    # value is set in the previous record). Put each converted
    # text record into a separate file. We will reference the
    # home.html file as the first file and let the HTML input
    # plugin assemble the order based on hyperlinks.
    with CurrentDir(output_dir):
        for uid, num in self.uid_text_secion_number.items():
            self.log.debug('Writing record with uid: %s as %s.html' % (uid, uid))
            with open('%s.html' % uid, 'wb') as htmlf:
                html = u'<html><body>'
                section_header, section_data = self.sections[num]
                if section_header.type == DATATYPE_PHTML:
                    # NOTE(review): this branch appends the result undecoded
                    # while the compressed branch below decodes it -- confirm
                    # what process_phtml returns.
                    html += self.process_phtml(
                        section_data.data,
                        section_data.header.paragraph_offsets)
                elif section_header.type == DATATYPE_PHTML_COMPRESSED:
                    d = self.decompress_phtml(section_data.data)
                    html += self.process_phtml(
                        d, section_data.header.paragraph_offsets).decode(
                            self.get_text_uid_encoding(section_header.uid),
                            'replace')
                html += '</body></html>'
                htmlf.write(html.encode('utf-8'))

    # Images.
    # Track the uids of the images that were written successfully so the
    # composite pass below can tell which parts are available.
    images = set()
    images_dir = os.path.join(output_dir, 'images/')
    if not os.path.exists(images_dir):
        os.makedirs(images_dir)
    with CurrentDir(images_dir):
        # Single images.
        for uid, num in self.uid_image_section_number.items():
            section_header, section_data = self.sections[num]
            if not section_data:
                self.log.error(
                    'Failed to write image with uid %s: No data.' % uid)
                continue
            # idata stays None for an unrecognised type/compression; the
            # save below then fails and is reported through the log.
            idata = None
            if section_header.type == DATATYPE_TBMP:
                idata = section_data
            elif section_header.type == DATATYPE_TBMP_COMPRESSED:
                if self.header_record.compression == 1:
                    idata = decompress_doc(section_data)
                elif self.header_record.compression == 2:
                    idata = zlib.decompress(section_data)
            try:
                save_cover_data_to(idata, '%s.jpg' % uid, compression_quality=70)
                images.add(uid)
                self.log.debug(
                    'Wrote image with uid %s to images/%s.jpg' % (uid, uid))
            except Exception as e:
                self.log.error(
                    'Failed to write image with uid %s: %s' % (uid, e))

        # Composite images.
        # We're going to use the already compressed .jpg images here.
        for uid, num in self.uid_composite_image_section_number.items():
            try:
                section_header, section_data = self.sections[num]
                # Get the final width and height: the widest row and the
                # sum of the tallest part in every row.
                width = 0
                height = 0
                for row in section_data.layout:
                    row_width = 0
                    col_height = 0
                    for col in row:
                        if col not in images:
                            raise Exception('Image with uid: %s missing.' % col)
                        # Close the part file once it has been measured.
                        with lopen('%s.jpg' % col, 'rb') as part:
                            w, h = identify(part)[1:]
                        row_width += w
                        col_height = max(col_height, h)
                    width = max(width, row_width)
                    height += col_height
                # Create a new image the total size of all image
                # parts. Put the parts into the new image.
                with Canvas(width, height) as canvas:
                    y_off = 0
                    for row in section_data.layout:
                        x_off = 0
                        largest_height = 0
                        for col in row:
                            with lopen('%s.jpg' % col, 'rb') as part:
                                im = image_from_data(part.read())
                            canvas.compose(im, x_off, y_off)
                            w, h = im.width(), im.height()
                            x_off += w
                            largest_height = max(largest_height, h)
                        y_off += largest_height
                # The file must be opened for binary writing; the default
                # read-only mode made this write fail every time.
                with lopen('%s.jpg' % uid, 'wb') as out:
                    out.write(canvas.export(compression_quality=70))
                self.log.debug(
                    'Wrote composite image with uid %s to images/%s.jpg' % (uid, uid))
            except Exception as e:
                self.log.error(
                    'Failed to write composite image with uid %s: %s' % (uid, e))

    # Run the HTML through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(self.options, opt.option.name, opt.recommended_value)
    self.options.input_encoding = 'utf-8'
    odi = self.options.debug_pipeline
    self.options.debug_pipeline = None
    try:
        # Determine the home.html record uid. This should be set in the
        # reserved values in the metadata record. home.html is the first
        # text record (should have hyper link references to other records)
        # in the document.
        try:
            home_html = self.header_record.home_html
            if not home_html:
                # items() is a view on Python 3 and cannot be indexed, so
                # take the first (uid, num) pair through an iterator.
                home_html = next(iter(self.uid_text_secion_number.items()))[0]
        except Exception:
            raise Exception('Could not determine home.html')
        # Generate oeb from html conversion.
        with open('%s.html' % home_html, 'rb') as home_file:
            oeb = html_input.convert(
                home_file, self.options, 'html', self.log, {})
    finally:
        # Put the caller's debug_pipeline setting back even if the
        # conversion failed.
        self.options.debug_pipeline = odi

    return oeb