def read_cover(stream, zin, mi, opfmeta, extract_cover):
    """Find the cover image of an ODF document and record it on ``mi``.

    Every frame of the loaded document is examined in order. Frames without
    an image, whose image cannot be read from ``zin``, or whose image cannot
    be identified, are skipped.

    * When ``opfmeta`` is true, a frame named ``opf.cover`` (compared
      case-insensitively) is the cover and ends the search.
    * Otherwise, or as a fallback while no ``opf.cover`` frame has been
      seen, the first identifiable image is used if it has a suitable size
      (a heuristic borrowed from the docx code).

    :param stream: passed to ``odLoad`` to load the document.
    :param zin: archive object; ``zin.read(href)`` must return the image
        bytes and raise ``KeyError`` for a missing member.
    :param mi: metadata object. ``cover`` and ``odf_cover_frame`` are set
        when a cover is found; ``cover_data`` is set to ``(fmt, raw)`` as
        well when ``extract_cover`` is true.
    :param opfmeta: whether to look for a frame named ``opf.cover``.
    :param extract_cover: whether to store the image data on ``mi``.
    """
    otext = odLoad(stream)
    cover_href = None
    cover_data = None
    cover_frame = None
    imgnum = 0
    for frm in otext.topnode.getElementsByType(odFrame):
        img = frm.getElementsByType(odImage)
        if not img:
            continue
        i_href = img[0].getAttribute('href')
        try:
            raw = zin.read(i_href)
        except KeyError:
            continue
        try:
            fmt, width, height = identify(raw)
        except Exception:
            continue
        imgnum += 1

        # A frame is not required to carry a name; treat a missing name as
        # "not the cover" instead of failing on None.lower().
        frame_name = frm.getAttribute('name')
        if opfmeta and frame_name and frame_name.lower() == 'opf.cover':
            cover_href = i_href
            cover_data = (fmt, raw)
            cover_frame = frame_name  # could have upper case
            break

        # Pick the first image as the cover if it is of a suitable size.
        # The width check keeps a degenerate image from raising
        # ZeroDivisionError in the aspect ratio test.
        if (cover_href is None and imgnum == 1 and width > 0
                and 0.8 <= height / width <= 1.8
                and height * width >= 12000):
            cover_href = i_href
            cover_data = (fmt, raw)
            if not opfmeta:
                break

    if cover_href is not None:
        mi.cover = cover_href
        mi.odf_cover_frame = cover_frame
        if extract_cover:
            # cover_data is always assigned together with cover_href above,
            # so there is nothing to re-read from the archive here.
            mi.cover_data = cover_data
def inspect_cover(self, href):
    """Return the dimensions of the manifest item whose href matches ``href``.

    The manifest is searched for items whose ``href`` equals the normalized
    form of ``href``. For a match, everything after the first element of
    ``identify(item.data)`` is returned (presumably ``(width, height)`` --
    confirm against ``identify``). If reading or identifying a matching
    item fails, the failure is logged and the search continues. ``(-1, -1)``
    is returned when no item yields dimensions.
    """
    from ebook_converter.ebooks.oeb.base import urlnormalize

    for entry in self.oeb.manifest:
        if entry.href != urlnormalize(href):
            continue
        try:
            dimensions = identify(entry.data)[1:]
        except Exception:
            self.log.exception('Failed to read cover image dimensions')
        else:
            return dimensions
    return -1, -1
def read_image(self, href):
    """Return the ``Image`` record for ``href``, creating it on first use.

    Records are cached in ``self.images``. On a cache miss the manifest item
    is looked up; ``None`` is returned when there is no such item or its
    data is not ``bytes``. Data that cannot be identified is replaced with
    the bundled blank image before the record is built. The item's data is
    unloaded from memory once the record has been stored.
    """
    if href in self.images:
        return self.images[href]

    item = self.oeb.manifest.hrefs.get(href)
    if item is None or not isinstance(item.data, bytes):
        return None

    try:
        fmt, width, height = identify(item.data)
    except Exception:
        self.log.warning('Replacing corrupted image with blank: %s' % href)
        item.data = I('blank.png', data=True, allow_user_override=False)
        fmt, width, height = identify(item.data)

    fname = 'media/' + self.create_filename(href, fmt)
    rid = self.document_relationships.add_image(fname)
    self.images[href] = Image(rid, fname, width, height, fmt, item)
    item.unload_data_from_memory()
    return self.images[href]
def _parse_cover_data(root, imgid, mi, ctx):
    """Decode the FB2 ``<binary>`` element ``imgid`` into ``mi.cover_data``.

    Nothing happens when no ``fb:binary`` element with that id exists. The
    element's ``content-type`` (default ``image/jpeg``) must map to at least
    one known file extension; if it does not but still looks like an image
    type, the type is guessed from ``imgid`` instead. An unsupported type
    only prints a warning. Otherwise the base64 text of the element is
    decoded and stored on ``mi`` as ``(fmt, data)``.
    """
    from ebook_converter.ebooks.fb2 import base64_decode

    found = ctx.XPath('//fb:binary[@id="%s"]' % imgid)(root)
    if not found:
        return
    binary = found[0]

    mimetype = binary.get('content-type', 'image/jpeg')
    extensions = mimetypes.guess_all_extensions(mimetype)
    if not extensions and mimetype.startswith('image/'):
        # The declared type is unknown; fall back to the id, which may look
        # like a file name.
        guessed = mimetypes.guess_type(imgid)[0]
        if guessed and guessed.startswith('image/'):
            extensions = mimetypes.guess_all_extensions(guessed)

    if not extensions:
        print(f"WARNING: Unsupported coverpage mime-type '{mimetype}' "
              f"(id=#{imgid})")
        return

    encoded = binary.text
    if encoded:
        cdata = base64_decode(encoded.strip())
        fmt = identify(cdata)[0]
        mi.cover_data = (fmt, cdata)
def mobimlize_elem(self, elem, stylizer, bstate, istates,
                   ignore_valign=False):
    """Convert one XHTML element, and recursively its children, to MobiML.

    A copy of the innermost inline state (``istates[-1]``) is pushed for the
    duration of the call, filled in from the element's computed style and
    attributes, and popped again before returning. Output is produced
    through ``self.mobimlize_content`` and tracked in ``bstate``.

    :param elem: element to convert; anything that is not an XHTML-namespace
        element is ignored.
    :param stylizer: provides the computed style via ``stylizer.style(elem)``.
    :param bstate: block state that is read and updated (current paragraph,
        pending vertical margin/padding, page-break flag, nesting stack).
    :param istates: stack of inline format states.
    :param ignore_valign: set on the recursive call that renders a vertically
        aligned element into a scratch tree, so it is not wrapped twice.
    """
    if not isinstance(elem.tag, (str, bytes)) \
            or parse_utils.namespace(elem.tag) != const.XHTML_NS:
        return
    style = stylizer.style(elem)
    # <mbp:frame-set/> does not exist lalalala
    if ((style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or
            style['visibility'] == 'hidden') and
            elem.get('data-calibre-jacket-searchable-tags', None) != '1'):
        id_ = elem.get('id', None)
        if id_:
            # Keep anchors so people can use display:none
            # to generate hidden TOCs
            tail = elem.tail
            elem.clear()
            elem.text = None
            elem.set('id', id_)
            elem.tail = tail
            elem.tag = base.tag('xhtml', 'a')
        else:
            return
    tag = parse_utils.barename(elem.tag)
    istate = copy.copy(istates[-1])
    istate.rendered = False
    istate.list_num = 0
    if tag == 'ol' and 'start' in elem.attrib:
        try:
            istate.list_num = int(elem.attrib['start'])-1
        except (TypeError, ValueError):
            # Non-numeric start attribute: keep numbering from the default
            pass
    istates.append(istate)
    left = 0
    display = style['display']
    if display == 'table-cell':
        display = 'inline'
    elif display.startswith('table'):
        display = 'block'
    isblock = (not display.startswith('inline') and
               style['display'] != 'none')
    isblock = isblock and style['float'] == 'none'
    isblock = isblock and tag != 'br'
    if isblock:
        bstate.para = None
        istate.halign = style['text-align']
        rawti = style._get('text-indent')
        istate.indent = style['text-indent']
        if hasattr(rawti, 'strip') and '%' in rawti:
            # We have a percentage text indent, these can come out looking
            # too large if the user chooses a wide output profile like
            # tablet
            istate.indent = min(style._unit_convert(rawti, base=500),
                                istate.indent)
        if style['margin-left'] == 'auto' \
                and style['margin-right'] == 'auto':
            istate.halign = 'center'
        margin = asfloat(style['margin-left'])
        padding = asfloat(style['padding-left'])
        if tag != 'body':
            left = margin + padding
        istate.left += left
        vmargin = asfloat(style['margin-top'])
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style['padding-top'])
        if vpadding > 0:
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    elif not istate.href:
        # Inline element outside a link: emulate horizontal margin/padding
        # with non-breaking spaces before and after the content.
        margin = asfloat(style['margin-left'])
        padding = asfloat(style['padding-left'])
        lspace = margin + padding
        if lspace > 0:
            spaces = int(round((lspace * 3) / style['font-size']))
            elem.text = ('\xa0' * spaces) + (elem.text or '')
        margin = asfloat(style['margin-right'])
        padding = asfloat(style['padding-right'])
        rspace = margin + padding
        if rspace > 0:
            spaces = int(round((rspace * 3) / style['font-size']))
            if len(elem) == 0:
                elem.text = (elem.text or '') + ('\xa0' * spaces)
            else:
                last = elem[-1]
                last.text = (last.text or '') + ('\xa0' * spaces)
    if bstate.content and style['page-break-before'] in PAGE_BREAKS:
        bstate.pbreak = True
    istate.fsize = self.mobimlize_font(style['font-size'])
    istate.italic = True if style['font-style'] == 'italic' else False
    weight = style['font-weight']
    istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
    istate.preserve = style['white-space'] == 'pre'
    istate.pre_wrap = style['white-space'] == 'pre-wrap'
    istate.bgcolor = style['background-color']
    istate.fgcolor = style['color']
    istate.strikethrough = style.effective_text_decoration == 'line-through'
    istate.underline = style.effective_text_decoration == 'underline'
    ff = style['font-family'].lower() if hasattr(style['font-family'],
                                                 'lower') else ''
    if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'):
        istate.family = 'monospace'
    elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff or
            'arial' in ff or 'helvetica' in ff):
        istate.family = 'sans-serif'
    else:
        istate.family = 'serif'
    if 'id' in elem.attrib:
        istate.ids.add(elem.attrib['id'])
    if 'name' in elem.attrib:
        istate.ids.add(elem.attrib['name'])
    if tag == 'a' and 'href' in elem.attrib:
        istate.href = elem.attrib['href']
    istate.attrib.clear()
    if tag == 'img' and 'src' in elem.attrib:
        istate.attrib['src'] = elem.attrib['src']
        istate.attrib['align'] = 'baseline'
        cssdict = style.cssdict()
        valign = cssdict.get('vertical-align', None)
        if valign in ('top', 'bottom', 'middle'):
            istate.attrib['align'] = valign
        for prop in ('width', 'height'):
            if cssdict[prop] != 'auto':
                value = style[prop]
                if value == getattr(self.profile, prop):
                    result = '100%'
                else:
                    # Amazon's renderer does not support
                    # img sizes in units other than px
                    # See #7520 for test case
                    try:
                        pixs = int(round(float(value) /
                                         (72/self.profile.dpi)))
                    except (TypeError, ValueError, ArithmeticError):
                        # Size cannot be expressed in pixels: leave the
                        # attribute unset
                        continue
                    result = str(pixs)
                istate.attrib[prop] = result
        if 'width' not in istate.attrib or 'height' not in istate.attrib:
            # At least one dimension is missing: derive it from the image
            # data itself.
            href = self.current_spine_item.abshref(elem.attrib['src'])
            try:
                item = self.oeb.manifest.hrefs[base.urlnormalize(href)]
            except Exception:
                self.oeb.logger.warn('Failed to find image:', href)
            else:
                try:
                    width, height = identify(item.data)[1:]
                except Exception:
                    self.oeb.logger.warn('Invalid image:', href)
                else:
                    if 'width' not in istate.attrib and 'height' not in \
                            istate.attrib:
                        istate.attrib['width'] = str(width)
                        istate.attrib['height'] = str(height)
                    elif 'width' not in istate.attrib:
                        # Scale by the aspect ratio. The ratio is computed
                        # inside the try so that a zero-height image, like a
                        # non-numeric attribute (e.g. '100%'), falls back to
                        # the intrinsic width instead of raising.
                        try:
                            width = (int(istate.attrib['height']) *
                                     (width / height))
                        except (TypeError, ValueError, ArithmeticError):
                            pass
                        istate.attrib['width'] = str(int(width))
                    else:
                        try:
                            height = (int(istate.attrib['width']) /
                                      (width / height))
                        except (TypeError, ValueError, ArithmeticError):
                            pass
                        istate.attrib['height'] = str(int(height))
                    item.unload_data_from_memory()
    elif tag == 'hr' and asfloat(style['width']) > 0 and \
            style._get('width') not in {'100%', 'auto'}:
        raww = style._get('width')
        if hasattr(raww, 'strip') and '%' in raww:
            istate.attrib['width'] = raww
        else:
            # Express the width as a percentage of the profile width
            prop = style['width'] / self.profile.width
            istate.attrib['width'] = "%d%%" % int(round(prop * 100))
    elif display == 'table':
        tag = 'table'
    elif display == 'table-row':
        tag = 'tr'
    elif display == 'table-cell':
        tag = 'td'
    if tag in TABLE_TAGS and self.ignore_tables:
        tag = 'span' if tag == 'td' else 'div'

    if tag in ('table', 'td', 'tr'):
        col = style.backgroundColor
        if col:
            elem.set('bgcolor', col)
        css = style.cssdict()
        if 'border' in css or 'border-width' in css:
            elem.set('border', '1')
    if tag in TABLE_TAGS:
        for attr in ('rowspan', 'colspan', 'width', 'border', 'scope',
                     'bgcolor'):
            if attr in elem.attrib:
                istate.attrib[attr] = elem.attrib[attr]
    if tag == 'q':
        # Surround quotations with typographic double quotes
        t = elem.text
        if not t:
            t = ''
        elem.text = '\u201c' + t
        t = elem.tail
        if not t:
            t = ''
        elem.tail = '\u201d' + t
    text = None
    if elem.text:
        if istate.preserve or istate.pre_wrap:
            text = elem.text
        elif (len(elem) > 0 and isspace(elem.text) and
                hasattr(elem[0].tag, 'rpartition') and
                elem[0].tag.rpartition('}')[-1] not in INLINE_TAGS):
            # Pure whitespace before a non-inline first child is dropped
            text = None
        else:
            text = COLLAPSE.sub(' ', elem.text)
    valign = style['vertical-align']
    not_baseline = valign in ('super', 'sub', 'text-top',
                              'text-bottom', 'top', 'bottom') or (
        isinstance(valign, numbers.Number) and abs(valign) != 0)
    issup = valign in ('super', 'text-top', 'top') or (
        isinstance(valign, numbers.Number) and valign > 0)
    vtag = 'sup' if issup else 'sub'
    if not_baseline and not ignore_valign and tag not in NOT_VTAGS \
            and not isblock:
        # Render the element into a scratch tree, then move the result into
        # a <sup>/<sub> + <small> wrapper in the real output.
        nroot = etree.Element(base.tag('xhtml', 'html'), nsmap=MOBI_NSMAP)
        vbstate = BlockState(etree.SubElement(nroot,
                                              base.tag('xhtml', 'body')))
        vbstate.para = etree.SubElement(vbstate.body,
                                        base.tag('xhtml', 'p'))
        self.mobimlize_elem(elem, stylizer, vbstate, istates,
                            ignore_valign=True)
        if len(istates) > 0:
            istates.pop()
        if len(istates) == 0:
            istates.append(FormatState())
        at_start = bstate.para is None
        if at_start:
            self.mobimlize_content('span', '', bstate, istates)
        parent = bstate.para if bstate.inline is None else bstate.inline
        if parent is not None:
            vtag = etree.SubElement(parent, base.tag('xhtml', vtag))
            vtag = etree.SubElement(vtag, base.tag('xhtml', 'small'))
            # Add anchors
            for child in vbstate.body:
                if child is not vbstate.para:
                    vtag.append(child)
                else:
                    break
            if vbstate.para is not None:
                if vbstate.para.text:
                    vtag.text = vbstate.para.text
                for child in vbstate.para:
                    vtag.append(child)
            return

    if tag == 'blockquote':
        # Margins are always honoured inside a blockquote; the user's
        # setting is restored after the children have been processed.
        old_mim = self.opts.mobi_ignore_margins
        self.opts.mobi_ignore_margins = False

    if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or (
            # We have an id but no text and no children, the id should still
            # be added.
            istate.ids and tag in ('a', 'span', 'i', 'b', 'u') and
            len(elem) == 0)):
        if tag == 'li' and len(istates) > 1 and 'value' in elem.attrib:
            try:
                value = int(elem.attrib['value'])
                istates[-2].list_num = value - 1
            except (TypeError, ValueError):
                # Non-numeric value attribute: keep the running number
                pass
        self.mobimlize_content(tag, text, bstate, istates)
    for child in elem:
        self.mobimlize_elem(child, stylizer, bstate, istates)
        tail = None
        if child.tail:
            if istate.preserve or istate.pre_wrap:
                tail = child.tail
            elif bstate.para is None and isspace(child.tail):
                tail = None
            else:
                tail = COLLAPSE.sub(' ', child.tail)
        if tail:
            self.mobimlize_content(tag, tail, bstate, istates)

    if tag == 'blockquote':
        self.opts.mobi_ignore_margins = old_mim

    if bstate.content and style['page-break-after'] in PAGE_BREAKS:
        bstate.pbreak = True
    if isblock:
        para = bstate.para
        if para is not None and para.text == '\xa0' and len(para) < 1:
            if style.height > 2:
                para.getparent().replace(
                    para, etree.Element(base.tag('xhtml', 'br')))
            else:
                # This is too small to be rendered effectively, drop it
                para.getparent().remove(para)
        bstate.para = None
        bstate.istate = None
        vmargin = asfloat(style['margin-bottom'])
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style['padding-bottom'])
        if vpadding > 0:
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    if bstate.nested and bstate.nested[-1].tag == elem.tag:
        bstate.nested.pop()
    istates.pop()
def cleanup_markup(log, root, styles, dest_dir, detect_cover, XPath):
    """Tidy the HTML tree produced from a DOCX document, in place.

    The passes run in this order: apply vertical alignment wrappers, move
    trailing ``<hr>`` elements out of their paragraph, merge consecutive
    mergeable spans, normalise ``dir`` attributes, lift spans that merely
    wrap a whole block, turn bold-only/italic-only spans into ``<b>``/``<i>``,
    remove unstyled spans, and rewrite ``<br>`` based page breaks.

    :param log: logger; only ``log.debug`` is used.
    :param root: root element of the tree to clean up.
    :param styles: object whose ``classes`` mapping has ``(class name, css)``
        pairs as values.
    :param dest_dir: directory that image ``src`` attributes are relative to.
    :param detect_cover: when true, check whether the first image of the
        document looks like a cover.
    :param XPath: factory that compiles an XPath expression into a callable.
    :return: path of the image that was detected as a cover and removed from
        the tree, otherwise ``None``.
    """
    # Apply vertical-align
    for span in root.xpath('//span[@data-docx-vert]'):
        wrap_contents(span.attrib.pop('data-docx-vert'), span)

    # Move <hr>s outside paragraphs, if possible.
    pancestor = XPath('|'.join(
        'ancestor::%s[1]' % x
        for x in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')))
    for hr in root.xpath('//span/hr'):
        p = pancestor(hr)
        if p:
            p = p[0]
            descendants = tuple(p.iterdescendants())
            if descendants[-1] is hr:
                parent = p.getparent()
                idx = parent.index(p)
                parent.insert(idx+1, hr)
                hr.tail = '\n\t'

    # Merge consecutive spans that have the same styling
    current_run = []
    for span in root.xpath('//span'):
        if not current_run:
            current_run.append(span)
        else:
            last = current_run[-1]
            if mergeable(last, span):
                current_run.append(span)
            else:
                if len(current_run) > 1:
                    merge_run(current_run)
                current_run = [span]
    # The loop only merges a run when the *next* span breaks it, so the run
    # that is still open when the spans are exhausted must be flushed here.
    if len(current_run) > 1:
        merge_run(current_run)

    # Process dir attributes
    class_map = dict(styles.classes.values())
    parents = ('p', 'div') + tuple('h%d' % i for i in range(1, 7))
    is_block = ' or '.join('name()="%s"' % t for t in parents)
    for parent in root.xpath('//*[(%s)]' % is_block):
        # Ensure that children of rtl parents that are not rtl have an
        # explicit dir set. Also, remove dir from children if it is the same
        # as that of the parent.
        if len(parent):
            parent_dir = parent.get('dir')
            for child in parent.iterchildren('span'):
                child_dir = child.get('dir')
                if parent_dir == 'rtl' and child_dir != 'rtl':
                    child_dir = 'ltr'
                    child.set('dir', child_dir)
                if child_dir and child_dir == parent_dir:
                    child.attrib.pop('dir')

    # Remove unnecessary span tags that are the only child of a parent block
    # element
    for parent in root.xpath('//*[(%s) and count(span)=1]' % is_block):
        if len(parent) == 1 and not parent.text and not parent[0].tail \
                and not parent[0].get('id', None):
            # We have a block whose contents are entirely enclosed in a <span>
            span = parent[0]
            span_class = span.get('class', None)
            span_css = class_map.get(span_class, {})
            span_dir = span.get('dir')
            if liftable(span_css) and (not span_dir or
                                       span_dir == parent.get('dir')):
                pclass = parent.get('class', None)
                if span_class:
                    pclass = ((pclass + ' ' + span_class) if pclass
                              else span_class)
                    parent.set('class', pclass)
                parent.text = span.text
                parent.remove(span)
                if span.get('lang'):
                    parent.set('lang', span.get('lang'))
                if span.get('dir'):
                    parent.set('dir', span.get('dir'))
                for child in span:
                    parent.append(child)

    # Make spans whose only styling is bold or italic into <b> and <i> tags
    for span in root.xpath('//span[@class and not(@style)]'):
        css = class_map.get(span.get('class', None), {})
        if len(css) == 1:
            if css == {'font-style': 'italic'}:
                span.tag = 'i'
                del span.attrib['class']
            elif css == {'font-weight': 'bold'}:
                span.tag = 'b'
                del span.attrib['class']

    # Get rid of <span>s that have no styling
    for span in root.xpath(
            '//span[not(@class or @id or @style or @lang or @dir)]'):
        lift(span)

    # Convert <p><br style="page-break-after:always"> </p> style page breaks
    # into something the viewer will render as a page break
    for p in root.xpath('//p[br[@style="page-break-after:always"]]'):
        if len(p) == 1 and (not p[0].tail or not p[0].tail.strip()):
            p.remove(p[0])
            prefix = p.get('style', '')
            if prefix:
                prefix += '; '
            p.set('style', prefix + 'page-break-after:always')
            p.text = NBSP if not p.text else p.text

    if detect_cover:
        # Check if the first image in the document is possibly a cover
        img = root.xpath('//img[@src][1]')
        if img:
            img = img[0]
            path = os.path.join(dest_dir, img.get('src'))
            if os.path.exists(path) and \
                    before_count(root, img, limit=10) < 5:
                from ebook_converter.utils.imghdr import identify
                try:
                    with open(path, 'rb') as imf:
                        width, height = identify(imf)[1:]
                except Exception:
                    # Unreadable image: it cannot be a cover
                    width, height = 0, 0
                try:
                    is_cover = (0.8 <= height/width <= 1.8 and
                                height*width >= 160000)
                except ZeroDivisionError:
                    is_cover = False
                if is_cover:
                    log.debug('Detected an image that looks like a cover')
                    img.getparent().remove(img)
                    return path
def extract_content(self, output_dir):
    """Write the text and image records to ``output_dir`` and convert them.

    Each text record becomes ``<uid>.html`` in ``output_dir``; each image
    record becomes ``images/<uid>.jpg``; composite images are assembled
    from the already written part images. The "home" HTML file is then run
    through the HTML input plugin.

    :param output_dir: directory that receives the extracted files.
    :return: whatever the HTML input plugin's ``convert`` returns for the
        home HTML file.
    :raises Exception: ``'Could not determine home.html'`` when no home
        record can be determined.
    """
    # Each text record is independent (unless the continuation
    # value is set in the previous record). Put each converted
    # text record into a separate file. We will reference the
    # home.html file as the first file and let the HTML input
    # plugin assemble the order based on hyperlinks.
    with CurrentDir(output_dir):
        for uid, num in self.uid_text_secion_number.items():
            self.log.debug('Writing record with uid: %s as %s.html' %
                           (uid, uid))
            with open('%s.html' % uid, 'wb') as htmlf:
                html = '<html><body>'
                section_header, section_data = self.sections[num]
                if section_header.type == DATATYPE_PHTML:
                    body = self.process_phtml(
                        section_data.data,
                        section_data.header.paragraph_offsets)
                    # NOTE(review): the compressed branch below decodes the
                    # result of process_phtml, so it presumably returns
                    # bytes here as well; decode it the same way rather
                    # than failing on str + bytes.
                    if isinstance(body, bytes):
                        body = body.decode(
                            self.get_text_uid_encoding(section_header.uid),
                            'replace')
                    html += body
                elif section_header.type == DATATYPE_PHTML_COMPRESSED:
                    d = self.decompress_phtml(section_data.data)
                    html += self.process_phtml(
                        d, section_data.header.paragraph_offsets).decode(
                            self.get_text_uid_encoding(section_header.uid),
                            'replace')
                html += '</body></html>'
                htmlf.write(html.encode('utf-8'))

    # Images.
    # Remember which images were written in case they are used by a
    # composite image.
    images = set()
    images_dir = os.path.join(output_dir, 'images/')
    if not os.path.exists(images_dir):
        os.makedirs(images_dir)
    with CurrentDir(images_dir):
        # Single images.
        for uid, num in self.uid_image_section_number.items():
            section_header, section_data = self.sections[num]
            if section_data:
                idata = None
                if section_header.type == DATATYPE_TBMP:
                    idata = section_data
                elif section_header.type == DATATYPE_TBMP_COMPRESSED:
                    if self.header_record.compression == 1:
                        idata = decompress_doc(section_data)
                    elif self.header_record.compression == 2:
                        idata = zlib.decompress(section_data)
                try:
                    save_cover_data_to(idata, '%s.jpg' % uid,
                                       compression_quality=70)
                    images.add(uid)
                    self.log.debug('Wrote image with uid %s to images/%s.jpg'
                                   % (uid, uid))
                except Exception as e:
                    self.log.error('Failed to write image with uid %s: %s' %
                                   (uid, e))
            else:
                self.log.error('Failed to write image with uid %s: No data.'
                               % uid)

        # Composite images.
        # We're going to use the already compressed .jpg images here.
        for uid, num in self.uid_composite_image_section_number.items():
            try:
                section_header, section_data = self.sections[num]
                # Get the final width and height.
                width = 0
                height = 0
                for row in section_data.layout:
                    row_width = 0
                    col_height = 0
                    for col in row:
                        if col not in images:
                            raise Exception('Image with uid: %s missing.' %
                                            col)
                        with open('%s.jpg' % col, 'rb') as part:
                            w, h = identify(part)[1:]
                        row_width += w
                        if col_height < h:
                            col_height = h
                    if width < row_width:
                        width = row_width
                    height += col_height
                # Create a new image the total size of all image
                # parts. Put the parts into the new image.
                with Canvas(width, height) as canvas:
                    y_off = 0
                    for row in section_data.layout:
                        x_off = 0
                        largest_height = 0
                        for col in row:
                            with open('%s.jpg' % col, 'rb') as part:
                                im = image_from_data(part.read())
                            canvas.compose(im, x_off, y_off)
                            w, h = im.width(), im.height()
                            x_off += w
                            if largest_height < h:
                                largest_height = h
                        y_off += largest_height
                # The file must be opened for binary writing; the default
                # read mode made every composite image fail.
                with open('%s.jpg' % uid, 'wb') as out:
                    out.write(canvas.export(compression_quality=70))
                self.log.debug('Wrote composite image with uid %s to '
                               'images/%s.jpg' % (uid, uid))
            except Exception as e:
                self.log.error('Failed to write composite image with uid '
                               '%s: %s' % (uid, e))

    # Run the HTML through the html processing plugin.
    from ebook_converter.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(self.options, opt.option.name, opt.recommended_value)
    self.options.input_encoding = 'utf-8'
    odi = self.options.debug_pipeline
    self.options.debug_pipeline = None
    try:
        # Determine the home.html record uid. This should be set in the
        # reserved values in the metadata record. home.html is the first
        # text record (should have hyper link references to other records)
        # in the document.
        try:
            home_html = self.header_record.home_html
            if not home_html:
                # dict views cannot be indexed; take the first key instead
                home_html = next(iter(self.uid_text_secion_number))
        except Exception as err:
            raise Exception('Could not determine home.html') from err
        # Generate oeb from html conversion.
        with open('%s.html' % home_html, 'rb') as home_stream:
            oeb = html_input.convert(home_stream, self.options, 'html',
                                     self.log, {})
    finally:
        # Restore the caller's setting even when the conversion fails
        self.options.debug_pipeline = odi

    return oeb