def _clean_opf(self, opf):
    """Rebuild a parsed OPF tree under a fresh OPF 2 ``package`` root.

    Elements with no namespace or the OPF 1 namespace are re-tagged into
    the OPF 2 namespace, Dublin Core elements are moved to the DC 1.1
    namespace with lower-cased names, and the metadata, manifest, spine,
    tours and guide sections are re-parented under the new root.

    @opf: The root element of the parsed OPF document (modified in place).
    Returns the newly created ``package`` element.
    """
    # Collect every namespace declaration in the document *before* any tag
    # is rewritten, so the new <metadata> element can carry all of them.
    collected_ns = {}
    for node in opf.iter(tag=etree.Element):
        collected_ns.update(node.nsmap)

    # Promote legacy (un-namespaced / OPF 1) elements to OPF 2. Names that
    # still contain a literal ':' are left alone here.
    legacy_namespaces = ('', OPF1_NS)
    for node in opf.iter(tag=etree.Element):
        if namespace(node.tag) not in legacy_namespaces:
            continue
        local = barename(node.tag)
        if ':' not in local:
            node.tag = OPF(local)

    collected_ns.update(OPF2_NSMAP)
    root_attrib = dict(opf.attrib)
    package = etree.Element(OPF('package'),
                            nsmap={None: OPF2_NS}, attrib=root_attrib)
    metadata = etree.SubElement(package, OPF('metadata'), nsmap=collected_ns)

    # The OPF 1 wrapper elements themselves are dropped; their children are
    # still visited by the descendant query below.
    skipped = (OPF('dc-metadata'), OPF('x-metadata'))
    for node in xpath(opf, 'o2:metadata//*'):
        if node.tag in skipped:
            continue
        if namespace(node.tag) in DC_NSES:
            node.tag = '{%s}%s' % (DC11_NS, barename(node.tag).lower())
        if node.tag.startswith('dc:'):
            # A literal "dc:" prefix that was never bound to a namespace.
            node.tag = '{%s}%s' % (DC11_NS,
                                   node.tag.partition(':')[-1].lower())
        metadata.append(node)

    for meta in xpath(opf, 'o2:metadata//o2:meta'):
        metadata.append(meta)

    for section in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
        for node in xpath(opf, section):
            package.append(node)

    return package
def dump_text(self, elem, stylizer, page):
    '''
    Serialize elem and its subtree to a list of HTML string fragments,
    dropping every ``style`` attribute along the way.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element (only passed
               on to the recursive calls here).
    @page: Passed through unchanged to the recursive calls.
    '''
    # Only XHTML elements are written as tags. For anything else keep just
    # the tail text, and only when the parent is an XHTML element.
    is_xhtml = isinstance(elem.tag, string_or_bytes) \
        and namespace(elem.tag) == XHTML_NS
    if not is_xhtml:
        parent = elem.getparent()
        parent_is_xhtml = parent is not None \
            and isinstance(parent.tag, string_or_bytes) \
            and namespace(parent.tag) == XHTML_NS
        if parent_is_xhtml and elem.tail:
            return [elem.tail]
        return ['']

    # <body> is written out as a <div>.
    name = barename(elem.tag)
    if name == 'body':
        name = 'div'

    # Drop the style attribute from the element itself (in place).
    attribs = elem.attrib
    attribs.pop('style', None)

    # Render the remaining attributes as ' key="value"' pairs.
    rendered_attrs = ''.join(
        ' %s="%s"' % (key, prepare_string_for_xml(value, attribute=True))
        for key, value in attribs.items())

    self_closing = name in SELF_CLOSING_TAGS
    out = ['', '<%s%s' % (name, rendered_attrs)]
    out.append(' />' if self_closing else '>')

    # Text directly inside the element.
    if hasattr(elem, 'text') and elem.text:
        out.append(self.prepare_string_for_html(elem.text))

    # Children, depth first.
    for child in elem:
        out.extend(self.dump_text(child, stylizer, page))

    # Closing tag, unless the element was written self-closed.
    if not self_closing:
        out.append('</%s>' % name)

    # Text that follows the element inside its parent.
    if hasattr(elem, 'tail') and elem.tail:
        out.append(self.prepare_string_for_html(elem.tail))

    return out
def serialize_elem(self, elem, item, nsrmap=NSRMAP):
    """Write elem and its subtree to ``self.buf`` as UTF-8 encoded markup.

    Elements whose tag is not a string, or whose namespace is not a key of
    ``nsrmap``, are skipped together with their subtree. The element's
    ``id`` attribute is removed and its buffer offset recorded in
    ``self.id_offsets`` instead. ``src`` attributes that resolve to a known
    image are replaced by a ``recindex`` attribute.

    @elem: The element to serialize (its ``id`` attribute is popped).
    @item: The item the element belongs to; provides ``href``/``abshref``.
    @nsrmap: Mapping of namespace URI to prefix; it is now also used for
             all descendants instead of silently reverting to the default.
    """
    buf = self.buf
    if not isinstance(elem.tag, basestring) \
            or namespace(elem.tag) not in nsrmap:
        return
    tag = prefixname(elem.tag, nsrmap)
    # Previous layers take care of @name
    id_ = elem.attrib.pop('id', None)
    if id_:
        href = '#'.join((item.href, id_))
        # Test against None explicitly: 0 is a legitimate anchor offset and
        # must not be replaced by the current buffer position.
        if self.anchor_offset is not None:
            offset = self.anchor_offset
        else:
            offset = buf.tell()
        key = urlnormalize(href)
        # Only set this id_offset if it wasn't previously seen
        self.id_offsets[key] = self.id_offsets.get(key, offset)
    # An empty <a> (no attributes left, no children, no text) that directly
    # follows another tag start produces no output of its own.
    if self.anchor_offset is not None and \
            tag == 'a' and not elem.attrib and \
            not len(elem) and not elem.text:
        return
    self.anchor_offset = buf.tell()
    buf.write(b'<')
    buf.write(tag.encode('utf-8'))
    if elem.attrib:
        for attr, val in elem.attrib.items():
            if namespace(attr) not in nsrmap:
                continue
            attr = prefixname(attr, nsrmap)
            buf.write(b' ')
            if attr == 'href':
                # serialize_href writes the attribute itself when it
                # returns a true value.
                if self.serialize_href(val, item):
                    continue
            elif attr == 'src':
                href = urlnormalize(item.abshref(val))
                if href in self.images:
                    index = self.images[href]
                    self.used_images.add(href)
                    buf.write(b'recindex="%05d"' % index)
                    continue
            buf.write(attr.encode('utf-8'))
            buf.write(b'="')
            self.serialize_text(val, quot=True)
            buf.write(b'"')
    buf.write(b'>')
    if elem.text or len(elem) > 0:
        if elem.text:
            # Real text ends the run of adjacent tag starts.
            self.anchor_offset = None
            self.serialize_text(elem.text)
        for child in elem:
            # Propagate the caller's namespace map to the whole subtree.
            self.serialize_elem(child, item, nsrmap)
            if child.tail:
                self.anchor_offset = None
                self.serialize_text(child.tail)
    buf.write(b'</%s>' % tag.encode('utf-8'))
def _read_opf(self):
    """Read, parse and normalise the OPF document of ``self.oeb``.

    Parsing is retried with progressively more aggressive clean-ups:
    entity replacement, then removal of the ``<tours>`` section plus a
    Dublin Core namespace declaration on ``<dc-metadata>``, and finally
    the recovering parser. Each successful fallback logs a warning.

    Returns the tree produced by ``self._clean_opf``.
    Raises OEBError when the root namespace is not empty, OPF 1 or OPF 2.
    """
    raw = self.oeb.decode(self.oeb.container.read(None))
    raw = XMLDECL_RE.sub('', raw)
    # Map every spelling of the OPF 1 namespace (with or without trailing
    # slashes) onto the canonical one.
    raw = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)',
                 OPF1_NS, raw)
    try:
        root = etree.fromstring(raw)
    except etree.XMLSyntaxError:
        # First fallback: strip invalid characters, replace entities.
        raw = xml_replace_entities(clean_xml_chars(raw), encoding=None)
        try:
            root = etree.fromstring(raw)
            self.logger.warn('OPF contains invalid HTML named entities')
        except etree.XMLSyntaxError:
            # Second fallback: drop <tours> and declare the dc: prefix.
            raw = re.sub(r'(?is)<tours>.+</tours>', '', raw)
            raw = raw.replace(
                '<dc-metadata>',
                '<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">')
            try:
                root = etree.fromstring(raw)
                self.logger.warn('OPF contains invalid tours section')
            except etree.XMLSyntaxError:
                # Last resort: the recovering parser.
                from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
                root = etree.fromstring(raw, parser=RECOVER_PARSER)
                self.logger.warn('OPF contains invalid markup, trying to parse it anyway')

    root_ns = namespace(root.tag)
    if root_ns not in ('', OPF1_NS, OPF2_NS):
        raise OEBError('Invalid namespace %r for OPF document' % root_ns)
    return self._clean_opf(root)
def mangle_elem(self, elem, stylizer):
    """Apply text-transform and small-caps styling to elem's subtree.

    The element's own text and the tail of each child are rewritten
    according to the element's ``text-transform`` and ``font-variant``
    styles; children are processed recursively. Non-XHTML nodes are left
    untouched.

    @elem: The element in the etree that we are working on.
    @stylizer: Provides the computed style through ``stylizer.style``.
    """
    if not (isinstance(elem.tag, basestring)
            and namespace(elem.tag) == XHTML_NS):
        return

    # Snapshot the children before any text is rewritten.
    kids = list(elem)
    css = stylizer.style(elem)
    transform = css['text-transform']
    variant = css['font-variant']

    def restyle(node, slot):
        # slot is 'text' or 'tail'; empty or missing text is skipped.
        if not getattr(node, slot):
            return
        if transform in TEXT_TRANSFORMS:
            setattr(node, slot,
                    self.text_transform(transform, getattr(node, slot)))
        if variant == 'small-caps':
            self.smallcaps_elem(node, slot)

    restyle(elem, 'text')
    for kid in kids:
        self.mangle_elem(kid, stylizer)
        # A child's tail belongs to this element, so it takes this
        # element's styling.
        restyle(kid, 'tail')
def dump_text(self, subitems, elem, stylizer, end='', pre=False, li=''):
    """Convert elem and its subtree to a list of plain-text fragments.

    @subitems: Iterable of (href, title) pairs; an element whose id equals
               one of the hrefs may start a new bookmark section.
    @elem: The element in the etree that we are working on.
    @stylizer: Provides the computed style through ``stylizer.style``.
    @end: The last characters emitted before this element, used to avoid
          adding duplicate separators.
    @pre: True when inside a <pre> element.
    @li: Pending list-item prefix to put in front of the next text.

    Returns ``(text, li)`` for rendered elements.
    NOTE(review): the early exits (non-XHTML or hidden elements) return a
    bare one-element list instead of a tuple, and the recursive call below
    indexes ``[0]`` into either shape. Callers outside this block are not
    visible, so that contract is left untouched here.
    """
    from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

    if not isinstance(elem.tag, basestring) \
            or namespace(elem.tag) != XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                and elem.tail:
            return [elem.tail]
        return ['']

    text = ['']
    style = stylizer.style(elem)

    # Emit a bookmark marker when this element starts a different sub item.
    elem_id = elem.attrib.get('id')
    if elem_id is not None and any(href == elem_id for href, title in subitems):
        if self.curSubItem is not None and self.curSubItem != elem_id:
            self.curSubItem = elem_id
            text.append(u'\n\n%s%s\n\n' % (CALIBRE_SNB_BM_TAG, self.curSubItem))

    # Hidden elements contribute only their tail text.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return ['']

    tag = barename(elem.tag)
    in_block = False

    # Are we in a paragraph block?
    if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
        in_block = True
        if not end.endswith(u'\n\n') and hasattr(elem, 'text') and elem.text:
            text.append(u'\n\n')

    if tag in SPACE_TAGS:
        # Fixed: the original tested for the two-character string 'u '
        # (a misplaced unicode prefix), so a separator was added even when
        # the previous text already ended with a space.
        if not end.endswith(u' ') and hasattr(elem, 'text') and elem.text:
            text.append(u' ')

    if tag == 'img':
        text.append(u'\n\n%s%s\n\n' % (CALIBRE_SNB_IMG_TAG, ProcessFileName(elem.attrib['src'])))

    if tag == 'br':
        text.append(u'\n\n')

    if tag == 'li':
        li = '- '

    pre = (tag == 'pre' or pre)
    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        if pre:
            # Every line of preformatted text gets its own marker.
            text.append((u'\n\n%s' % CALIBRE_SNB_PRE_TAG).join((li + elem.text).splitlines()))
        else:
            text.append(li + elem.text)
        li = ''

    for item in elem:
        # Tell the child what the output currently ends with.
        en = u''
        if len(text) >= 2:
            en = text[-1][-2:]
        t = self.dump_text(subitems, item, stylizer, en, pre, li)[0]
        text += t

    if in_block:
        text.append(u'\n\n')

    if hasattr(elem, 'tail') and elem.tail:
        if pre:
            text.append((u'\n\n%s' % CALIBRE_SNB_PRE_TAG).join(elem.tail.splitlines()))
        else:
            text.append(li + elem.tail)
        li = ''

    return text, li
def dump_text(self, elem, stylizer):
    '''
    Convert elem and its subtree to a list of Markdown text fragments.

    Italic/bold/code/pre state, blockquote depth and list nesting are
    tracked on self while the tree is walked, so markers are opened only
    once and closed by the element that opened them.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    '''
    # We can only processes tags. If there isn't a tag return any text.
    if not isinstance(elem.tag, basestring) \
            or namespace(elem.tag) != XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                and elem.tail:
            return [elem.tail]
        return ['']

    # Setup our variables.
    text = []
    style = stylizer.style(elem)
    tags = []
    tag = barename(elem.tag)
    attribs = elem.attrib
    headings = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    # Ignore anything that is set to not be displayed.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return ['']

    # Soft scene breaks.
    if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
        ems = int(round(float(style.marginTop) / style.fontSize) - 1)
        if ems >= 1:
            text.append(u'\n\n' * ems)

    bq = '> ' * self.blockquotes
    # Block level elements
    if tag in headings or tag in ('p', 'div'):
        h_tag = ''
        if tag in headings:
            h_tag = '#' * int(tag[1]) + ' '
        text.append('\n' + bq + h_tag)
        tags.append('\n')
        self.remove_space_after_newline = True

    if style['font-style'] == 'italic' or tag in ('i', 'em'):
        if tag not in headings and tag != 'cite':
            if not self.style_italic:
                text.append('*')
                tags.append('*')
                self.style_italic = True
    if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
        if tag not in headings and tag != 'th':
            if not self.style_bold:
                text.append('**')
                tags.append('**')
                self.style_bold = True
    if tag == 'br':
        # A Markdown hard line break needs two trailing spaces; a single
        # space is silently folded into the paragraph.
        text.append('  \n')
        self.remove_space_after_newline = True
    if tag == 'blockquote':
        self.blockquotes += 1
        tags.append('>')
        text.append('> ' * self.blockquotes)
    elif tag == 'code':
        if not self.in_pre and not self.in_code:
            text.append('`')
            tags.append('`')
            self.in_code = True
    elif tag == 'pre':
        if not self.in_pre:
            text.append('\n')
            tags.append('pre')
            self.in_pre = True
    elif tag == 'hr':
        text.append('\n* * *')
        tags.append('\n')
    elif tag == 'a':
        # Only write links with absolute (external) urls.
        if self.opts.keep_links and 'href' in attribs and '://' in attribs['href']:
            title = ''
            if 'title' in attribs:
                title = ' "' + attribs['title'] + '"'
                # remove_newlines is called on attribute text here, so the
                # flag it shares with body text is saved and restored.
                remove_space = self.remove_space_after_newline
                title = self.remove_newlines(title)
                self.remove_space_after_newline = remove_space
            text.append('[')
            tags.append('](' + attribs['href'] + title + ')')
    elif tag == 'img':
        # An <img> without src has nothing to reference; skip it instead
        # of raising KeyError.
        if self.opts.keep_image_references and 'src' in attribs:
            txt = '!'
            if 'alt' in attribs:
                remove_space = self.remove_space_after_newline
                txt += '[' + self.remove_newlines(attribs['alt']) + ']'
                self.remove_space_after_newline = remove_space
            txt += '(' + attribs['src'] + ')'
            text.append(txt)
    elif tag in ('ol', 'ul'):
        tags.append(tag)
        # Add the list to our lists of lists so we can track
        # nested lists.
        self.list.append({'name': tag, 'num': 0})
    elif tag == 'li':
        # Get the last list from our list of lists
        if self.list:
            li = self.list[-1]
        else:
            li = {'name': 'ul', 'num': 0}
        # Add a new line to start the item
        text.append('\n')
        # Add indent if we have nested lists.
        list_count = len(self.list)
        # We only care about indenting nested lists.
        if (list_count - 1) > 0:
            text.append('\t' * (list_count - 1))
        # Add blockquote if we have a blockquote in a list item.
        text.append(bq)
        # Write the proper sign for ordered and unordered lists.
        if li['name'] == 'ul':
            text.append('+ ')
        elif li['name'] == 'ol':
            li['num'] += 1
            text.append(unicode(li['num']) + '. ')

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        txt = elem.text
        if self.in_pre:
            txt = self.prepare_string_for_pre(txt)
        elif self.in_code:
            txt = self.remove_newlines(txt)
        else:
            txt = self.prepare_string_for_markdown(
                self.remove_newlines(txt))
        text.append(txt)

    # Recurse down into tags within the tag we are in.
    for item in elem:
        text += self.dump_text(item, stylizer)

    # Close all open tags.
    tags.reverse()
    for t in tags:
        if t in ('pre', 'ul', 'ol', '>'):
            # Structural markers reset state instead of writing themselves.
            if t == 'pre':
                self.in_pre = False
                text.append('\n')
            elif t == '>':
                self.blockquotes -= 1
            elif t in ('ul', 'ol'):
                if self.list:
                    self.list.pop()
                text.append('\n')
        else:
            if t == '**':
                self.style_bold = False
            elif t == '*':
                self.style_italic = False
            elif t == '`':
                self.in_code = False
            text.append('%s' % t)

    # Soft scene breaks.
    if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
        ems = int(round((float(style.marginBottom) / style.fontSize) - 1))
        if ems >= 1:
            text.append(u'\n\n' * ems)

    # Add the text that is outside of the tag.
    if hasattr(elem, 'tail') and elem.tail:
        tail = elem.tail
        if self.in_pre:
            tail = self.prepare_string_for_pre(tail)
        elif self.in_code:
            tail = self.remove_newlines(tail)
        else:
            tail = self.prepare_string_for_markdown(
                self.remove_newlines(tail))
        text.append(tail)

    return text
def dump_text(self, elem, stylizer, page):
    '''
    Serialize elem and its subtree to a list of HTML string fragments,
    replacing ``class``/``style`` attributes with one inline ``style``
    attribute built from the stylizer's style for the element.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    @page: Passed through unchanged to the recursive calls.
    '''
    # Only XHTML elements are written as tags. For anything else keep just
    # the tail text, and only when the parent is an XHTML element.
    is_xhtml = isinstance(elem.tag, string_or_bytes) \
        and namespace(elem.tag) == XHTML_NS
    if not is_xhtml:
        parent = elem.getparent()
        parent_is_xhtml = parent is not None \
            and isinstance(parent.tag, string_or_bytes) \
            and namespace(parent.tag) == XHTML_NS
        if parent_is_xhtml and elem.tail:
            return [elem.tail]
        return ['']

    css = stylizer.style(elem)
    name = barename(elem.tag)
    attribs = elem.attrib

    # Inline style text is the string form of the computed style.
    inline_css = ('%s' % css) or ''
    if name == 'body':
        # The body becomes a div so multiple files can be merged. Renderers
        # typically treat a new file as a page break, so force one here and
        # strip any other page-break declarations.
        name = 'div'
        without_breaks = re.sub('page-break-[^:]+:[^;]+;?', '', inline_css)
        inline_css = 'page-break-before: always; %s' % without_breaks
    # Collapse runs of whitespace.
    inline_css = re.sub(r'\s{2,}', ' ', inline_css).strip()

    # class and style are replaced by the inline style computed above.
    for unwanted in ('class', 'style'):
        if unwanted in attribs:
            del attribs[unwanted]

    # Render the remaining attributes as ' key="value"' pairs.
    rendered_attrs = ''.join(
        ' %s="%s"' % (key, prepare_string_for_xml(value, attribute=True))
        for key, value in attribs.items())

    # Double quotes inside the css would end the attribute early.
    rendered_style = ''
    if inline_css:
        rendered_style = ' style="%s"' % inline_css.replace('"', "'")

    self_closing = name in SELF_CLOSING_TAGS
    out = ['', '<%s%s%s' % (name, rendered_attrs, rendered_style)]
    out.append(' />' if self_closing else '>')

    # Text directly inside the element.
    if hasattr(elem, 'text') and elem.text:
        out.append(self.prepare_string_for_html(elem.text))

    # Children, depth first.
    for child in elem:
        out.extend(self.dump_text(child, stylizer, page))

    # Closing tag, unless the element was written self-closed.
    if not self_closing:
        out.append('</%s>' % name)

    # Text that follows the element inside its parent.
    if hasattr(elem, 'tail') and elem.tail:
        out.append(self.prepare_string_for_html(elem.tail))

    return out
def dump_text(self, elem, stylizer, page, tag_stack=None):
    """Convert elem and its subtree to a list of PML text fragments.

    @elem: The element in the etree that we are working on.
    @stylizer: Provides the computed style through ``stylizer.style``.
    @page: The item being converted; provides ``href`` and ``abshref``.
    @tag_stack: PML codes already opened by ancestor elements. It is never
                modified; children receive a new list. Defaults to empty.

    Returns a list of strings; non-XHTML and hidden elements yield [].
    """
    from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

    if not isinstance(elem.tag, basestring) \
            or namespace(elem.tag) != XHTML_NS:
        return []

    # Avoid a shared mutable default argument.
    if tag_stack is None:
        tag_stack = []

    text = []
    tags = []
    style = stylizer.style(elem)

    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        return []

    tag = barename(elem.tag)

    def is_open(code):
        # True when code was opened by an ancestor or by this element.
        return code in tag_stack or code in tags

    # Are we in a paragraph block?
    if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
        tags.append('block')

    # Process tags that need special processing and that do not have inner
    # text. Usually these require an argument.
    if tag in IMAGE_TAGS:
        src = elem.attrib.get('src', None)
        if src:
            img_href = page.abshref(src)
            if img_href not in self.image_hrefs:
                # The first image seen is named as the cover.
                if not self.image_hrefs:
                    self.image_hrefs[img_href] = 'cover.png'
                else:
                    self.image_hrefs[img_href] = image_name(
                        '%s.png' % len(self.image_hrefs),
                        self.image_hrefs.keys()).strip('\x00')
            text.append('\\m="%s"' % self.image_hrefs[img_href])
    elif tag == 'hr':
        w = '\\w'
        width = elem.get('width')
        if width:
            if not width.endswith('%'):
                width += '%'
            w += '="%s"' % width
        else:
            w += '="50%"'
        text.append(w)
    elif tag == 'br':
        text.append('\n\\c \n\\c\n')

    # TOC markers.
    toc_name = elem.attrib.get('name', None)
    toc_id = elem.attrib.get('id', None)
    # Only write the TOC marker if the tag isn't a heading and we aren't in one.
    in_heading = any(is_open(code)
                     for code in ('x', 'X0', 'X1', 'X2', 'X3', 'X4'))
    if (toc_id or toc_name) \
            and tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6') \
            and not in_heading:
        toc_page = page.href
        if self.toc.get(toc_page, None):
            for toc_x in (toc_name, toc_id):
                toc_title, toc_depth = self.toc[toc_page].get(toc_x, (None, 0))
                if toc_title:
                    # Clamp the depth to the 0-4 range.
                    toc_depth = max(min(toc_depth, 4), 0)
                    text.append('\\C%s="%s"' % (toc_depth, toc_title))

    # Process style information that needs holds a single tag.
    if style['page-break-before'] == 'always':
        text.append('\\p')

    # Process basic PML tags.
    pml_tag = TAG_MAP.get(tag, None)
    if pml_tag and not is_open(pml_tag):
        text.append('\\%s' % pml_tag)
        tags.append(pml_tag)

    # Special processing of tags that require an argument.
    # Anchors links
    if tag in LINK_TAGS and not is_open('q'):
        href = elem.get('href')
        if href:
            href = page.abshref(href)
            if '://' not in href:
                if '#' not in href:
                    href += '#'
                if href not in self.link_hrefs:
                    self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs)
                href = '#%s' % self.link_hrefs[href]
                text.append('\\q="%s"' % href)
                tags.append('q')

    # Anchor ids
    id_name = elem.get('id')
    name_name = elem.get('name')
    for name_x in (id_name, name_name):
        if name_x:
            text.append(self.get_anchor(page, name_x))

    # Processes style information
    for s in STYLES:
        style_tag = s[1].get(style[s[0]], None)
        if style_tag and not is_open(style_tag):
            text.append('\\%s' % style_tag)
            tags.append(style_tag)

    # margin left
    # Best effort: style values that cannot be converted are ignored, but
    # KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        mms = int(float(style['margin-left']) * 100 / style.height)
        if mms:
            text.append('\\T="%s%%"' % mms)
    except Exception:
        pass

    # Soft scene breaks.
    try:
        ems = int(round((float(style.marginTop) / style.fontSize) - 1))
        if ems >= 1:
            text.append('\n\\c \n\\c\n')
    except Exception:
        pass

    # Process text within this tag.
    if hasattr(elem, 'text') and elem.text:
        text.append(self.prepare_string_for_pml(elem.text))

    # Process inner tags
    for item in elem:
        text += self.dump_text(item, stylizer, page, tag_stack + tags)

    # Close opened tags.
    tags.reverse()
    text += self.close_tags(tags)

    if style['page-break-after'] == 'always':
        text.append('\\p')

    # Process text after this tag but not within another.
    if hasattr(elem, 'tail') and elem.tail:
        text.append(self.prepare_string_for_pml(elem.tail))

    return text
def dump_text(self, elem, stylizer):
    '''
    Convert elem and its subtree to a list of Markdown text fragments.

    Italic/bold/code/pre state, blockquote depth and list nesting are
    tracked on self while the tree is walked, so markers are opened only
    once and closed by the element that opened them.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    '''
    # We can only processes tags. If there isn't a tag return any text.
    if not isinstance(elem.tag, basestring) \
            or namespace(elem.tag) != XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                and elem.tail:
            return [elem.tail]
        return ['']

    # Setup our variables.
    text = []
    style = stylizer.style(elem)
    tags = []
    tag = barename(elem.tag)
    attribs = elem.attrib
    heading_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    # Ignore anything that is set to not be displayed.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return ['']

    # Soft scene breaks.
    if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
        ems = int(round(float(style.marginTop) / style.fontSize) - 1)
        if ems >= 1:
            text.append(u'\n\n' * ems)

    bq = '> ' * self.blockquotes
    # Block level elements
    if tag in heading_tags or tag in ('p', 'div'):
        h_tag = ''
        if tag in heading_tags:
            h_tag = '#' * int(tag[1]) + ' '
        text.append('\n' + bq + h_tag)
        tags.append('\n')
        self.remove_space_after_newline = True

    if style['font-style'] == 'italic' or tag in ('i', 'em'):
        if tag not in heading_tags and tag != 'cite':
            if not self.style_italic:
                text.append('*')
                tags.append('*')
                self.style_italic = True
    if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
        if tag not in heading_tags and tag != 'th':
            if not self.style_bold:
                text.append('**')
                tags.append('**')
                self.style_bold = True
    if tag == 'br':
        # A Markdown hard line break needs two trailing spaces; a single
        # space is silently folded into the paragraph.
        text.append('  \n')
        self.remove_space_after_newline = True
    if tag == 'blockquote':
        self.blockquotes += 1
        tags.append('>')
        text.append('> ' * self.blockquotes)
    elif tag == 'code':
        if not self.in_pre and not self.in_code:
            text.append('`')
            tags.append('`')
            self.in_code = True
    elif tag == 'pre':
        if not self.in_pre:
            text.append('\n')
            tags.append('pre')
            self.in_pre = True
    elif tag == 'hr':
        text.append('\n* * *')
        tags.append('\n')
    elif tag == 'a':
        # Only write links with absolute (external) urls.
        if self.opts.keep_links and 'href' in attribs and '://' in attribs['href']:
            title = ''
            if 'title' in attribs:
                title = ' "' + attribs['title'] + '"'
                # remove_newlines is called on attribute text here, so the
                # flag it shares with body text is saved and restored.
                remove_space = self.remove_space_after_newline
                title = self.remove_newlines(title)
                self.remove_space_after_newline = remove_space
            text.append('[')
            tags.append('](' + attribs['href'] + title + ')')
    elif tag == 'img':
        # An <img> without src has nothing to reference; skip it instead
        # of raising KeyError.
        if self.opts.keep_image_references and 'src' in attribs:
            txt = '!'
            if 'alt' in attribs:
                remove_space = self.remove_space_after_newline
                txt += '[' + self.remove_newlines(attribs['alt']) + ']'
                self.remove_space_after_newline = remove_space
            txt += '(' + attribs['src'] + ')'
            text.append(txt)
    elif tag in ('ol', 'ul'):
        tags.append(tag)
        # Add the list to our lists of lists so we can track
        # nested lists.
        self.list.append({'name': tag, 'num': 0})
    elif tag == 'li':
        # Get the last list from our list of lists
        if self.list:
            li = self.list[-1]
        else:
            li = {'name': 'ul', 'num': 0}
        # Add a new line to start the item
        text.append('\n')
        # Add indent if we have nested lists.
        list_count = len(self.list)
        # We only care about indenting nested lists.
        if (list_count - 1) > 0:
            text.append('\t' * (list_count - 1))
        # Add blockquote if we have a blockquote in a list item.
        text.append(bq)
        # Write the proper sign for ordered and unordered lists.
        if li['name'] == 'ul':
            text.append('+ ')
        elif li['name'] == 'ol':
            li['num'] += 1
            text.append(unicode(li['num']) + '. ')

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        txt = elem.text
        if self.in_pre:
            txt = self.prepare_string_for_pre(txt)
        elif self.in_code:
            txt = self.remove_newlines(txt)
        else:
            txt = self.prepare_string_for_markdown(self.remove_newlines(txt))
        text.append(txt)

    # Recurse down into tags within the tag we are in.
    for item in elem:
        text += self.dump_text(item, stylizer)

    # Close all open tags.
    tags.reverse()
    for t in tags:
        if t in ('pre', 'ul', 'ol', '>'):
            # Structural markers reset state instead of writing themselves.
            if t == 'pre':
                self.in_pre = False
                text.append('\n')
            elif t == '>':
                self.blockquotes -= 1
            elif t in ('ul', 'ol'):
                if self.list:
                    self.list.pop()
                text.append('\n')
        else:
            if t == '**':
                self.style_bold = False
            elif t == '*':
                self.style_italic = False
            elif t == '`':
                self.in_code = False
            text.append('%s' % t)

    # Soft scene breaks.
    if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
        ems = int(round((float(style.marginBottom) / style.fontSize) - 1))
        if ems >= 1:
            text.append(u'\n\n' * ems)

    # Add the text that is outside of the tag.
    if hasattr(elem, 'tail') and elem.tail:
        tail = elem.tail
        if self.in_pre:
            tail = self.prepare_string_for_pre(tail)
        elif self.in_code:
            tail = self.remove_newlines(tail)
        else:
            tail = self.prepare_string_for_markdown(self.remove_newlines(tail))
        text.append(tail)

    return text
def mobimlize_elem(self, elem, stylizer, bstate, istates, ignore_valign=False):
    """Translate one XHTML element (and, recursively, its children) into
    output content via ``self.mobimlize_content``.

    A copy of ``istates[-1]`` is pushed onto ``istates`` for the duration
    of the call, filled in from the element's computed style, and popped
    again before returning on the normal path.

    @elem: The element being converted. It may be modified in place
           (re-tagged, padded with non-breaking spaces, given attributes).
    @stylizer: Provides the computed style through ``stylizer.style``.
    @bstate: Block-level state shared across calls (current paragraph,
             pending vertical margin/padding, page-break flag, ...).
    @istates: Stack of inline format states; the top entry is the parent's.
    @ignore_valign: When True, skip the vertical-align (sup/sub) handling;
                    set by the recursive call made from that handling.
    """
    # Only string-tagged elements in the XHTML namespace are handled.
    if not isinstance(elem.tag, basestring) \
            or namespace(elem.tag) != XHTML_NS:
        return
    style = stylizer.style(elem)
    # <mbp:frame-set/> does not exist lalalala
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        id_ = elem.get('id', None)
        if id_:
            # Keep anchors so people can use display:none
            # to generate hidden TOCs
            tail = elem.tail
            elem.clear()
            elem.text = None
            elem.set('id', id_)
            elem.tail = tail
            elem.tag = XHTML('a')
        else:
            return
    tag = barename(elem.tag)
    # Start from the parent's inline state and override per element.
    istate = copy.copy(istates[-1])
    istate.rendered = False
    istate.list_num = 0
    if tag == 'ol' and 'start' in elem.attrib:
        try:
            istate.list_num = int(elem.attrib['start'])-1
        except:
            pass
    istates.append(istate)
    left = 0
    # Reduce the CSS display value to inline or block for the test below.
    display = style['display']
    if display == 'table-cell':
        display = 'inline'
    elif display.startswith('table'):
        display = 'block'
    isblock = (not display.startswith('inline') and style['display'] != 'none')
    isblock = isblock and style['float'] == 'none'
    isblock = isblock and tag != 'br'
    if isblock:
        # A block element always starts a new paragraph.
        bstate.para = None
        istate.halign = style['text-align']
        istate.indent = style['text-indent']
        if style['margin-left'] == 'auto' \
                and style['margin-right'] == 'auto':
            istate.halign = 'center'
        margin = asfloat(style['margin-left'])
        padding = asfloat(style['padding-left'])
        if tag != 'body':
            left = margin + padding
        istate.left += left
        vmargin = asfloat(style['margin-top'])
        # Keep the larger of the pending margin and this element's margin.
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style['padding-top'])
        if vpadding > 0:
            # Padding is not merged with the margin: fold the pending
            # margin into the padding total and add this padding on top.
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    elif not istate.href:
        # Inline element outside a link: horizontal margin/padding is
        # emulated with non-breaking spaces added to the text.
        margin = asfloat(style['margin-left'])
        padding = asfloat(style['padding-left'])
        lspace = margin + padding
        if lspace > 0:
            spaces = int(round((lspace * 3) / style['font-size']))
            elem.text = (u'\xa0' * spaces) + (elem.text or '')
        margin = asfloat(style['margin-right'])
        padding = asfloat(style['padding-right'])
        rspace = margin + padding
        if rspace > 0:
            spaces = int(round((rspace * 3) / style['font-size']))
            if len(elem) == 0:
                elem.text = (elem.text or '') + (u'\xa0' * spaces)
            else:
                last = elem[-1]
                last.text = (last.text or '') + (u'\xa0' * spaces)
    if bstate.content and style['page-break-before'] in PAGE_BREAKS:
        bstate.pbreak = True
    # Copy the character-level formatting into the inline state.
    istate.fsize = self.mobimlize_font(style['font-size'])
    istate.italic = True if style['font-style'] == 'italic' else False
    weight = style['font-weight']
    istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
    istate.preserve = (style['white-space'] in ('pre', 'pre-wrap'))
    istate.bgcolor = style['background-color']
    istate.fgcolor = style['color']
    istate.strikethrough = style.effective_text_decoration == 'line-through'
    istate.underline = style.effective_text_decoration == 'underline'
    # Map the font family onto monospace, sans-serif or serif.
    ff = style['font-family'].lower() if style['font-family'] else ''
    if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'):
        istate.family = 'monospace'
    elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff or
            'arial' in ff or 'helvetica' in ff):
        istate.family = 'sans-serif'
    else:
        istate.family = 'serif'
    # Both id and name are collected into the inline state's ids.
    if 'id' in elem.attrib:
        istate.ids.add(elem.attrib['id'])
    if 'name' in elem.attrib:
        istate.ids.add(elem.attrib['name'])
    if tag == 'a' and 'href' in elem.attrib:
        istate.href = elem.attrib['href']
    istate.attrib.clear()
    if tag == 'img' and 'src' in elem.attrib:
        istate.attrib['src'] = elem.attrib['src']
        istate.attrib['align'] = 'baseline'
        cssdict = style.cssdict()
        valign = cssdict.get('vertical-align', None)
        if valign in ('top', 'bottom', 'middle'):
            istate.attrib['align'] = valign
        for prop in ('width', 'height'):
            if cssdict[prop] != 'auto':
                value = style[prop]
                if value == getattr(self.profile, prop):
                    result = '100%'
                else:
                    # Amazon's renderer does not support
                    # img sizes in units other than px
                    # See #7520 for test case
                    try:
                        pixs = int(round(float(value) / \
                            (72./self.profile.dpi)))
                    except:
                        continue
                    result = str(pixs)
                istate.attrib[prop] = result
        if 'width' not in istate.attrib or 'height' not in istate.attrib:
            # At least one dimension is still unknown: read the image data
            # to get its real size.
            href = self.current_spine_item.abshref(elem.attrib['src'])
            try:
                item = self.oeb.manifest.hrefs[urlnormalize(href)]
            except:
                self.oeb.logger.warn('Failed to find image:', href)
            else:
                try:
                    width, height = identify_data(item.data)[:2]
                except:
                    self.oeb.logger.warn('Invalid image:', href)
                else:
                    if 'width' not in istate.attrib and 'height' not in \
                            istate.attrib:
                        istate.attrib['width'] = str(width)
                        istate.attrib['height'] = str(height)
                    else:
                        # One dimension is known: derive the other from the
                        # image's aspect ratio, falling back to the real
                        # size if the known value is not an integer.
                        ar = float(width)/float(height)
                        if 'width' not in istate.attrib:
                            try:
                                width = int(istate.attrib['height'])*ar
                            except:
                                pass
                            istate.attrib['width'] = str(int(width))
                        else:
                            try:
                                height = int(istate.attrib['width'])/ar
                            except:
                                pass
                            istate.attrib['height'] = str(int(height))
                    item.unload_data_from_memory()
    elif tag == 'hr' and asfloat(style['width']) > 0:
        # Rule width as a percentage of the profile width.
        prop = style['width'] / self.profile.width
        istate.attrib['width'] = "%d%%" % int(round(prop * 100))
    elif display == 'table':
        tag = 'table'
    elif display == 'table-row':
        tag = 'tr'
    elif display == 'table-cell':
        # NOTE(review): display was rewritten from 'table-cell' to 'inline'
        # above, so this branch looks unreachable - confirm.
        tag = 'td'
    if tag in TABLE_TAGS and self.ignore_tables:
        tag = 'span' if tag == 'td' else 'div'
    if tag in ('table', 'td', 'tr'):
        col = style.backgroundColor
        if col:
            elem.set('bgcolor', col)
        css = style.cssdict()
        if 'border' in css or 'border-width' in css:
            elem.set('border', '1')
    if tag in TABLE_TAGS:
        for attr in ('rowspan', 'colspan', 'width', 'border', 'scope', 'bgcolor'):
            if attr in elem.attrib:
                istate.attrib[attr] = elem.attrib[attr]
    if tag == 'q':
        # Wrap quotations in curly double quotes.
        t = elem.text
        if not t:
            t = ''
        elem.text = u'\u201c' + t
        t = elem.tail
        if not t:
            t = ''
        elem.tail = u'\u201d' + t
    text = None
    if elem.text:
        if istate.preserve:
            text = elem.text
        elif len(elem) > 0 and isspace(elem.text):
            # Whitespace-only text in front of child elements is dropped.
            text = None
        else:
            text = COLLAPSE.sub(' ', elem.text)
    valign = style['vertical-align']
    not_baseline = valign in ('super', 'sub', 'text-top', 'text-bottom', 'top', 'bottom') or (
        isinstance(valign, (float, int)) and abs(valign) != 0)
    issup = valign in ('super', 'text-top', 'top') or (
        isinstance(valign, (float, int)) and valign > 0)
    vtag = 'sup' if issup else 'sub'
    if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock:
        # Render the element into a scratch tree, then move the result
        # under a <sup>/<sub> + <small> wrapper in the real tree.
        nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
        vbstate = BlockState(etree.SubElement(nroot, XHTML('body')))
        vbstate.para = etree.SubElement(vbstate.body, XHTML('p'))
        self.mobimlize_elem(elem, stylizer, vbstate, istates, ignore_valign=True)
        # Drop the state pushed at the top of this call, but never leave
        # the stack empty.
        if len(istates) > 0:
            istates.pop()
        if len(istates) == 0:
            istates.append(FormatState())
        at_start = bstate.para is None
        if at_start:
            self.mobimlize_content('span', '', bstate, istates)
        parent = bstate.para if bstate.inline is None else bstate.inline
        if parent is not None:
            vtag = etree.SubElement(parent, XHTML(vtag))
            vtag = etree.SubElement(vtag, XHTML('small'))
            # Add anchors
            for child in vbstate.body:
                if child is not vbstate.para:
                    vtag.append(child)
                else:
                    break
            if vbstate.para is not None:
                for child in vbstate.para:
                    vtag.append(child)
            return
    if tag == 'blockquote':
        # The ignore-margins option is switched off while inside a
        # blockquote and restored after the children are processed.
        old_mim = self.opts.mobi_ignore_margins
        self.opts.mobi_ignore_margins = False
    if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or (
            # We have an id but no text and no children, the id should still
            # be added.
            istate.ids and tag in ('a', 'span', 'i', 'b', 'u') and len(elem)==0)):
        self.mobimlize_content(tag, text, bstate, istates)
    for child in elem:
        self.mobimlize_elem(child, stylizer, bstate, istates)
        # The child's tail is content of this element.
        tail = None
        if child.tail:
            if istate.preserve:
                tail = child.tail
            elif bstate.para is None and isspace(child.tail):
                tail = None
            else:
                tail = COLLAPSE.sub(' ', child.tail)
        if tail:
            self.mobimlize_content(tag, tail, bstate, istates)
    if tag == 'blockquote':
        self.opts.mobi_ignore_margins = old_mim
    if bstate.content and style['page-break-after'] in PAGE_BREAKS:
        bstate.pbreak = True
    if isblock:
        para = bstate.para
        # A paragraph holding only a single non-breaking space and no
        # children is replaced by a <br> or removed.
        if para is not None and para.text == u'\xa0' and len(para) < 1:
            if style.height > 2:
                para.getparent().replace(para, etree.Element(XHTML('br')))
            else:
                # This is too small to be rendered effectively, drop it
                para.getparent().remove(para)
        bstate.para = None
        bstate.istate = None
        vmargin = asfloat(style['margin-bottom'])
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style['padding-bottom'])
        if vpadding > 0:
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    if bstate.nested and bstate.nested[-1].tag == elem.tag:
        bstate.nested.pop()
    # Balance the push made at the top of this call.
    istates.pop()
def dump_text(self, elem, stylizer, tag_stack=None):
    '''
    Convert elem, its children and its tail text to RTF markup.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    @tag_stack: RTF groups opened by the enclosing calls of this recursion.
        Leave it unset for the outermost call.
    @return: The generated RTF markup as a single string.
    '''
    from calibre.ebooks.oeb.base import (XHTML_NS, namespace, barename,
                                         urlnormalize)

    # The stack is mutated in place while recursing. A list used as the
    # default value would be shared by every call, so entries left behind by
    # a conversion that raised part-way would leak into all later ones.
    if tag_stack is None:
        tag_stack = []

    # We can only process XHTML tags. For anything else (comments,
    # processing instructions, foreign namespaces) keep just the tail text,
    # and only when the parent is an XHTML element.
    if not isinstance(elem.tag, string_or_bytes) \
            or namespace(elem.tag) != XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                and elem.tail:
            return elem.tail
        return ''

    parts = []
    style = stylizer.style(elem)

    # Hidden elements contribute nothing but their tail.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return elem.tail
        return ''

    tag = barename(elem.tag)
    # Number of entries this call pushed onto tag_stack; exactly that many
    # are popped again further down.
    tag_count = 0

    # Are we in a paragraph block?
    if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
        if 'block' not in tag_stack:
            tag_count += 1
            tag_stack.append('block')

    # Process tags that need special processing and that do not have inner
    # text. Usually these require an argument.
    if tag == 'img':
        src = elem.get('src')
        if src:
            src = urlnormalize(self.currently_dumping_item.abshref(src))
            block_start = ''
            block_end = ''
            if 'block' not in tag_stack:
                block_start = r'{\par\pard\hyphpar '
                block_end = '}'
            # Only a placeholder carrying the normalized href is written
            # here.
            parts.append('%s SPECIAL_IMAGE-%s-REPLACE_ME %s' % (block_start, src, block_end))

    single_tag = SINGLE_TAGS.get(tag, None)
    if single_tag:
        parts.append(single_tag)

    rtf_tag = TAGS.get(tag, None)
    if rtf_tag and rtf_tag not in tag_stack:
        tag_count += 1
        parts.append('{%s\n' % rtf_tag)
        tag_stack.append(rtf_tag)

    # Process style information.
    for s in STYLES:
        style_tag = s[1].get(style[s[0]], None)
        if style_tag and style_tag not in tag_stack:
            tag_count += 1
            parts.append('{%s\n' % style_tag)
            tag_stack.append(style_tag)

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        parts.append(txt2rtf(elem.text))

    for item in elem:
        parts.append(self.dump_text(item, stylizer, tag_stack))

    # Close every group opened by this call. The 'block' marker is only
    # bookkeeping and has no closing markup of its own.
    for _ in range(tag_count):
        end_tag = tag_stack.pop()
        if end_tag != 'block':
            if tag in BLOCK_TAGS:
                parts.append(r'\par\pard\plain\hyphpar}')
            else:
                parts.append('}')

    # Tail text outside of any block gets a paragraph group of its own.
    if hasattr(elem, 'tail') and elem.tail:
        if 'block' in tag_stack:
            parts.append('%s' % txt2rtf(elem.tail))
        else:
            parts.append(r'{\par\pard\hyphpar %s}' % txt2rtf(elem.tail))

    return ''.join(parts)
def mobimlize_elem(self, elem, stylizer, bstate, istates, ignore_valign=False):
    """
    Convert one XHTML element, and recursively its children, into the
    output tree tracked by bstate.

    @elem: The element in the etree that we are working on. It may be
        modified in place (text, tail, tag and attributes).
    @stylizer: The style information; stylizer.style(elem) is queried.
    @bstate: Block level state (current paragraph, pending vertical
        margin/padding, page break flag, nesting), updated in place.
    @istates: Stack of inline format states. A copy of the top entry is
        pushed for this element and popped again before returning.
    @ignore_valign: Skip the vertical-align handling. Set by the recursive
        call that renders the content of a raised/lowered element.

    Returns None; all results are side effects on the arguments.
    """
    # Only XHTML elements are converted; comments, processing instructions
    # and foreign-namespace elements are skipped entirely.
    if not isinstance(elem.tag, basestring) or namespace(elem.tag) != XHTML_NS:
        return
    style = stylizer.style(elem)
    # <mbp:frame-set/> does not exist lalalala
    if style["display"] in ("none", "oeb-page-head", "oeb-page-foot") or style["visibility"] == "hidden":
        id_ = elem.get("id", None)
        if id_:
            # Keep anchors so people can use display:none
            # to generate hidden TOCs
            # The element is emptied and turned into a bare <a id=...>,
            # keeping only its tail text.
            tail = elem.tail
            elem.clear()
            elem.text = None
            elem.set("id", id_)
            elem.tail = tail
            elem.tag = XHTML("a")
        else:
            return
    tag = barename(elem.tag)
    # Start from a copy of the enclosing inline state so changes made for
    # this element do not affect the entry below it on the stack.
    istate = copy.copy(istates[-1])
    istate.rendered = False
    istate.list_num = 0
    if tag == "ol" and "start" in elem.attrib:
        # Stored as start - 1; a non-numeric "start" is ignored.
        try:
            istate.list_num = int(elem.attrib["start"]) - 1
        except:
            pass
    istates.append(istate)
    left = 0
    # Normalise the CSS display value: table cells are treated as inline,
    # every other table-* value as block.
    display = style["display"]
    if display == "table-cell":
        display = "inline"
    elif display.startswith("table"):
        display = "block"
    # Floated elements and <br> are never treated as blocks.
    isblock = not display.startswith("inline") and style["display"] != "none"
    isblock = isblock and style["float"] == "none"
    isblock = isblock and tag != "br"
    if isblock:
        # A block always starts a new paragraph.
        bstate.para = None
        istate.halign = style["text-align"]
        rawti = style._get("text-indent")
        istate.indent = style["text-indent"]
        if hasattr(rawti, "strip") and "%" in rawti:
            # We have a percentage text indent, these can come out looking
            # too large if the user chooses a wide output profile like
            # tablet
            istate.indent = min(style._unit_convert(rawti, base=500), istate.indent)
        # Auto margins on both sides are rendered as centered text.
        if style["margin-left"] == "auto" and style["margin-right"] == "auto":
            istate.halign = "center"
        margin = asfloat(style["margin-left"])
        padding = asfloat(style["padding-left"])
        if tag != "body":
            left = margin + padding
        istate.left += left
        # The largest of the pending and the new top margin wins. Padding,
        # when present, absorbs the pending margin and is accumulated.
        vmargin = asfloat(style["margin-top"])
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style["padding-top"])
        if vpadding > 0:
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    elif not istate.href:
        # Inline element outside of a link: horizontal margin/padding is
        # imitated with runs of non-breaking spaces added to the text.
        margin = asfloat(style["margin-left"])
        padding = asfloat(style["padding-left"])
        lspace = margin + padding
        if lspace > 0:
            spaces = int(round((lspace * 3) / style["font-size"]))
            elem.text = (u"\xa0" * spaces) + (elem.text or "")
        margin = asfloat(style["margin-right"])
        padding = asfloat(style["padding-right"])
        rspace = margin + padding
        if rspace > 0:
            spaces = int(round((rspace * 3) / style["font-size"]))
            if len(elem) == 0:
                elem.text = (elem.text or "") + (u"\xa0" * spaces)
            else:
                # With children present the padding is appended to the
                # text of the last child.
                last = elem[-1]
                last.text = (last.text or "") + (u"\xa0" * spaces)
    if bstate.content and style["page-break-before"] in PAGE_BREAKS:
        bstate.pbreak = True
    # Copy the character level formatting into the inline state.
    istate.fsize = self.mobimlize_font(style["font-size"])
    istate.italic = True if style["font-style"] == "italic" else False
    weight = style["font-weight"]
    istate.bold = weight in ("bold", "bolder") or asfloat(weight) > 400
    istate.preserve = style["white-space"] == "pre"
    istate.pre_wrap = style["white-space"] == "pre-wrap"
    istate.bgcolor = style["background-color"]
    istate.fgcolor = style["color"]
    istate.strikethrough = style.effective_text_decoration == "line-through"
    istate.underline = style.effective_text_decoration == "underline"
    # Reduce the font family to one of monospace / sans-serif / serif.
    ff = style["font-family"].lower() if hasattr(style["font-family"], "lower") else ""
    if "monospace" in ff or "courier" in ff or ff.endswith(" mono"):
        istate.family = "monospace"
    elif "sans-serif" in ff or "sansserif" in ff or "verdana" in ff or "arial" in ff or "helvetica" in ff:
        istate.family = "sans-serif"
    else:
        istate.family = "serif"
    # Both id and name are collected as anchor ids.
    if "id" in elem.attrib:
        istate.ids.add(elem.attrib["id"])
    if "name" in elem.attrib:
        istate.ids.add(elem.attrib["name"])
    if tag == "a" and "href" in elem.attrib:
        istate.href = elem.attrib["href"]
    istate.attrib.clear()
    if tag == "img" and "src" in elem.attrib:
        istate.attrib["src"] = elem.attrib["src"]
        istate.attrib["align"] = "baseline"
        cssdict = style.cssdict()
        valign = cssdict.get("vertical-align", None)
        if valign in ("top", "bottom", "middle"):
            istate.attrib["align"] = valign
        for prop in ("width", "height"):
            if cssdict[prop] != "auto":
                value = style[prop]
                if value == getattr(self.profile, prop):
                    # Same size as the profile's page dimension.
                    result = "100%"
                else:
                    # Amazon's renderer does not support
                    # img sizes in units other than px
                    # See #7520 for test case
                    # NOTE(review): the 72.0 / dpi factor suggests value is
                    # in points -- confirm against the stylizer.
                    try:
                        pixs = int(round(float(value) / (72.0 / self.profile.dpi)))
                    except:
                        continue
                    result = str(pixs)
                istate.attrib[prop] = result
        if "width" not in istate.attrib or "height" not in istate.attrib:
            # At least one dimension is still unknown: read the image itself
            # and fill in the missing value(s).
            href = self.current_spine_item.abshref(elem.attrib["src"])
            try:
                item = self.oeb.manifest.hrefs[urlnormalize(href)]
            except:
                self.oeb.logger.warn("Failed to find image:", href)
            else:
                try:
                    width, height = identify_data(item.data)[:2]
                except:
                    self.oeb.logger.warn("Invalid image:", href)
                else:
                    if "width" not in istate.attrib and "height" not in istate.attrib:
                        istate.attrib["width"] = str(width)
                        istate.attrib["height"] = str(height)
                    else:
                        # One dimension came from CSS: derive the other from
                        # the image's aspect ratio. If the CSS value is not
                        # an integer (e.g. "100%") the image's own dimension
                        # is used unchanged.
                        ar = float(width) / float(height)
                        if "width" not in istate.attrib:
                            try:
                                width = int(istate.attrib["height"]) * ar
                            except:
                                pass
                            istate.attrib["width"] = str(int(width))
                        else:
                            try:
                                height = int(istate.attrib["width"]) / ar
                            except:
                                pass
                            istate.attrib["height"] = str(int(height))
                    item.unload_data_from_memory()
    elif tag == "hr" and asfloat(style["width"]) > 0:
        # Rule width is written as a percentage of the profile width.
        prop = style["width"] / self.profile.width
        istate.attrib["width"] = "%d%%" % int(round(prop * 100))
    elif display == "table":
        tag = "table"
    elif display == "table-row":
        tag = "tr"
    elif display == "table-cell":
        # NOTE(review): display was rewritten from "table-cell" to "inline"
        # above, so this branch looks unreachable -- confirm.
        tag = "td"
    if tag in TABLE_TAGS and self.ignore_tables:
        tag = "span" if tag == "td" else "div"
    if tag in ("table", "td", "tr"):
        # Carry background colour and the presence of a border over as
        # attributes on the source element; they are copied to the inline
        # state below.
        col = style.backgroundColor
        if col:
            elem.set("bgcolor", col)
        css = style.cssdict()
        if "border" in css or "border-width" in css:
            elem.set("border", "1")
    if tag in TABLE_TAGS:
        for attr in ("rowspan", "colspan", "width", "border", "scope", "bgcolor"):
            if attr in elem.attrib:
                istate.attrib[attr] = elem.attrib[attr]
    if tag == "q":
        # Wrap the quotation in typographic double quotes.
        t = elem.text
        if not t:
            t = ""
        elem.text = u"\u201c" + t
        t = elem.tail
        if not t:
            t = ""
        elem.tail = u"\u201d" + t
    text = None
    if elem.text:
        if istate.preserve or istate.pre_wrap:
            text = elem.text
        elif (
            len(elem) > 0
            and isspace(elem.text)
            and hasattr(elem[0].tag, "rpartition")
            and elem[0].tag.rpartition("}")[-1] not in INLINE_TAGS
        ):
            # Whitespace-only text directly before a non-inline first child
            # is dropped.
            text = None
        else:
            text = COLLAPSE.sub(" ", elem.text)
    valign = style["vertical-align"]
    not_baseline = valign in ("super", "sub", "text-top", "text-bottom", "top", "bottom") or (
        isinstance(valign, (float, int)) and abs(valign) != 0
    )
    issup = valign in ("super", "text-top", "top") or (isinstance(valign, (float, int)) and valign > 0)
    vtag = "sup" if issup else "sub"
    if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock:
        # Raised/lowered inline content: render the element into a scratch
        # tree first (with ignore_valign set so this branch is not entered
        # again), then move the result under <sup>/<sub><small> in the
        # real output.
        nroot = etree.Element(XHTML("html"), nsmap=MOBI_NSMAP)
        vbstate = BlockState(etree.SubElement(nroot, XHTML("body")))
        vbstate.para = etree.SubElement(vbstate.body, XHTML("p"))
        self.mobimlize_elem(elem, stylizer, vbstate, istates, ignore_valign=True)
        # Drop the state pushed by this call, but never leave the stack
        # empty.
        if len(istates) > 0:
            istates.pop()
        if len(istates) == 0:
            istates.append(FormatState())
        at_start = bstate.para is None
        if at_start:
            # No open paragraph yet: emit an empty span so there is a
            # parent to attach to.
            self.mobimlize_content("span", "", bstate, istates)
        parent = bstate.para if bstate.inline is None else bstate.inline
        if parent is not None:
            # vtag is rebound from the tag name to the created elements.
            vtag = etree.SubElement(parent, XHTML(vtag))
            vtag = etree.SubElement(vtag, XHTML("small"))
            # Add anchors
            for child in vbstate.body:
                if child is not vbstate.para:
                    vtag.append(child)
                else:
                    break
            if vbstate.para is not None:
                if vbstate.para.text:
                    vtag.text = vbstate.para.text
                for child in vbstate.para:
                    vtag.append(child)
            return
    if tag == "blockquote":
        # Margins are honoured inside blockquotes regardless of the user
        # option; the previous value is restored further down.
        old_mim = self.opts.mobi_ignore_margins
        self.opts.mobi_ignore_margins = False
    if (
        text
        or tag in CONTENT_TAGS
        or tag in NESTABLE_TAGS
        or (
            # We have an id but no text and no children, the id should still
            # be added.
            istate.ids
            and tag in ("a", "span", "i", "b", "u")
            and len(elem) == 0
        )
    ):
        if tag == "li" and len(istates) > 1 and "value" in elem.attrib:
            # An explicit value renumbers the enclosing list: the counter
            # lives in the state below this one on the stack.
            try:
                value = int(elem.attrib["value"])
                istates[-2].list_num = value - 1
            except:
                pass
        self.mobimlize_content(tag, text, bstate, istates)
    for child in elem:
        self.mobimlize_elem(child, stylizer, bstate, istates)
        # The text following a child belongs to this element.
        tail = None
        if child.tail:
            if istate.preserve or istate.pre_wrap:
                tail = child.tail
            elif bstate.para is None and isspace(child.tail):
                tail = None
            else:
                tail = COLLAPSE.sub(" ", child.tail)
        if tail:
            self.mobimlize_content(tag, tail, bstate, istates)
    if tag == "blockquote":
        self.opts.mobi_ignore_margins = old_mim
    if bstate.content and style["page-break-after"] in PAGE_BREAKS:
        bstate.pbreak = True
    if isblock:
        para = bstate.para
        # A paragraph holding nothing but a single non-breaking space is
        # either replaced by a <br> or removed, depending on its height.
        if para is not None and para.text == u"\xa0" and len(para) < 1:
            if style.height > 2:
                para.getparent().replace(para, etree.Element(XHTML("br")))
            else:
                # This is too small to be rendered effectively, drop it
                para.getparent().remove(para)
        bstate.para = None
        bstate.istate = None
        # Bottom margin/padding are folded into the pending values the same
        # way as the top ones.
        vmargin = asfloat(style["margin-bottom"])
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style["padding-bottom"])
        if vpadding > 0:
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    if bstate.nested and bstate.nested[-1].tag == elem.tag:
        bstate.nested.pop()
    # Remove the inline state pushed for this element.
    istates.pop()
def dump_text(self, elem, stylizer):
    '''
    Convert elem, its children and its tail text to Textile markup.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    @return: List of strings; joined together they form the markup.
    '''

    # We can only process tags. If there isn't a tag return any text.
    if not isinstance(elem.tag, str) \
            or namespace(elem.tag) != XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, str) and namespace(p.tag) == XHTML_NS \
                and elem.tail:
            return [elem.tail]
        return ['']

    # Setup our variables.
    # text collects the output; tags collects the closing markup, in the
    # order the constructs were opened (it is reversed before closing).
    text = ['']
    style = stylizer.style(elem)
    tags = []
    tag = barename(elem.tag)
    attribs = elem.attrib

    # Ignore anything that is set to not be displayed.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return ['']

    # Soft scene breaks.
    if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
        ems = min(int(round(float(style.marginTop) / style.fontSize) - 1), self.MAX_EM)
        if ems >= 1:
            text.append('\n\n\xa0' * ems)

    if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
        # A div is written as a paragraph block.
        if tag == 'div':
            tag = 'p'
        text.append(self.build_block(tag, style, attribs, stylizer))
        text.append('. ')
        tags.append('\n')

    # Inline styles. Each one is opened at most once (tracked by the
    # self.style_* flags); inside a link the bracket-less form is used.
    if style['font-style'] == 'italic' or tag in ('i', 'em'):
        if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
            if self.style_italic == False:  # noqa
                if self.in_a_link:
                    text.append('_')
                    tags.append('_')
                else:
                    text.append('[_')
                    tags.append('_]')
                self.style_embed.append('_')
                self.style_italic = True
    if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
        if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
            if self.style_bold == False:  # noqa
                if self.in_a_link:
                    text.append('*')
                    tags.append('*')
                else:
                    text.append('[*')
                    tags.append('*]')
                self.style_embed.append('*')
                self.style_bold = True
    if style['text-decoration'] == 'underline' or tag in ('u', 'ins'):
        if tag != 'a':
            if self.style_under == False:  # noqa
                text.append('[+')
                tags.append('+]')
                self.style_embed.append('+')
                self.style_under = True
    if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'):
        if self.style_strike == False:  # noqa
            text.append('[-')
            tags.append('-]')
            self.style_embed.append('-')
            self.style_strike = True
    if tag == 'br':
        # Close the open inline styles before the line break and reopen
        # them after it.
        for i in reversed(self.style_embed):
            text.append(i)
        text.append('\n')
        for i in self.style_embed:
            text.append(i)
        tags.append('')
        self.remove_space_after_newline = True
    if tag == 'blockquote':
        text.append('\nbq. ')
        tags.append('\n')
    elif tag in ('abbr', 'acronym'):
        text.append('')
        # The expansion is optional; without a title there is nothing to
        # put in parentheses.
        title = attribs.get('title')
        tags.append('' if title is None else '(' + title + ')')
    elif tag == 'sup':
        text.append('^')
        tags.append('^')
    elif tag == 'sub':
        text.append('~')
        tags.append('~')
    elif tag == 'code':
        if self.in_pre:
            text.append('\nbc. ')
            tags.append('')
        else:
            text.append('@')
            tags.append('@')
    elif tag == 'cite':
        text.append('??')
        tags.append('??')
    elif tag == 'hr':
        text.append('\n***')
        tags.append('\n')
    elif tag == 'pre':
        self.in_pre = True
        text.append('\npre. ')
        tags.append('pre\n')
    elif tag == 'a':
        if self.opts.keep_links:
            if 'href' in attribs:
                # Pushed so that after reversal the output reads
                # "text(title)":href
                text.append('"')
                tags.append('a')
                tags.append('":' + attribs['href'])
                self.our_links.append(attribs['href'])
                if 'title' in attribs:
                    tags.append('(' + attribs['title'] + ')')
                self.in_a_link = True
            else:
                text.append('%')
                tags.append('%')
    elif tag == 'img':
        # An image without a source cannot be referenced; skip it instead
        # of failing on the missing attribute.
        if self.opts.keep_image_references and 'src' in attribs:
            txt = '!' + self.check_halign(style)
            txt += self.check_valign(style)
            txt += attribs['src']
            text.append(txt)
            if 'alt' in attribs:
                txt = attribs['alt']
                if txt != '':
                    text.append('(' + txt + ')')
            tags.append('!')
    elif tag in ('ol', 'ul'):
        self.list.append({'name': tag, 'num': 0})
        text.append('')
        tags.append(tag)
    elif tag == 'li':
        # A stray list item is treated as part of an unordered list.
        if self.list:
            li = self.list[-1]
        else:
            li = {'name': 'ul', 'num': 0}
        text.append('\n')
        # The marker is repeated once per nesting level.
        if li['name'] == 'ul':
            text.append('*' * len(self.list) + ' ')
        elif li['name'] == 'ol':
            text.append('#' * len(self.list) + ' ')
        tags.append('')
    elif tag == 'dl':
        text.append('\n')
        tags.append('')
    elif tag == 'dt':
        text.append('')
        tags.append('\n')
    elif tag == 'dd':
        text.append(' ')
        tags.append('')
    elif tag == 'table':
        txt = self.build_block(tag, style, attribs, stylizer)
        txt += '. \n'
        # Only write the table block when it carries attributes/styles.
        if txt != '\ntable. \n':
            text.append(txt)
        else:
            text.append('\n')
        tags.append('')
    elif tag == 'tr':
        txt = self.build_block('', style, attribs, stylizer)
        txt += '. '
        if txt != '\n. ':
            txt = re.sub('\n', '', txt)
            text.append(txt)
        tags.append('|\n')
    elif tag == 'td':
        text.append('|')
        txt = ''
        txt += self.check_halign(style)
        txt += self.check_valign(style)
        if 'colspan' in attribs:
            txt += '\\' + attribs['colspan']
        if 'rowspan' in attribs:
            txt += '/' + attribs['rowspan']
        txt += self.check_styles(style)
        if txt != '':
            text.append(txt + '. ')
        tags.append('')
    elif tag == 'th':
        text.append('|_. ')
        tags.append('')
    elif tag == 'span':
        if style['font-variant'] == 'small-caps':
            if self.style_smallcap == False:  # noqa
                text.append('&')
                tags.append('&')
                self.style_smallcap = True
        else:
            if self.in_a_link == False:  # noqa
                txt = '%'
                if self.opts.keep_links:
                    txt += self.check_id_tag(attribs)
                    txt += self.check_styles(style)
                # A span that adds nothing is not written at all.
                if txt != '%':
                    text.append(txt)
                    tags.append('%')

    if self.opts.keep_links and 'id' in attribs:
        if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'):
            text.append(self.check_id_tag(attribs))

    # Process the styles for any that we want to keep
    if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img',
                   'span', 'table', 'tr', 'td'):
        if not self.in_a_link:
            text.append(self.check_styles(style))

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        txt = elem.text
        if not self.in_pre:
            txt = self.prepare_string_for_textile(self.remove_newlines(txt))
        text.append(txt)
        self.id_no_text = ''

    # Recurse down into tags within the tag we are in.
    for item in elem:
        text += self.dump_text(item, stylizer)

    # Close all open tags.
    tags.reverse()
    for t in tags:
        if t in ('pre', 'ul', 'ol', 'li', 'table'):
            # NOTE(review): the <pre> branch above pushes 'pre\n', which
            # never equals 'pre', so in_pre does not appear to be reset
            # here -- confirm before relying on it.
            if t == 'pre':
                self.in_pre = False
            elif t in ('ul', 'ol'):
                if self.list:
                    self.list.pop()
                if not self.list:
                    text.append('\n')
        else:
            if t == 'a':
                # 'a' is only a marker for the end of a link.
                self.in_a_link = False
                t = ''
            text.append(self.id_no_text)
            self.id_no_text = ''
            if t in ('*]', '*'):
                self.style_bold = False
            elif t in ('_]', '_'):
                self.style_italic = False
            elif t == '+]':
                self.style_under = False
            elif t == '-]':
                self.style_strike = False
            elif t == '&':
                self.style_smallcap = False
            if t in ('*]', '_]', '+]', '-]', '*', '_'):
                self.style_embed.pop()
            text.append('%s' % t)

    # Soft scene breaks.
    if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
        ems = min(int(round((float(style.marginBottom) / style.fontSize) - 1)), self.MAX_EM)
        if ems >= 1:
            text.append('\n\n\xa0' * ems)

    # Add the text that is outside of the tag.
    if hasattr(elem, 'tail') and elem.tail:
        tail = elem.tail
        if not self.in_pre:
            tail = self.prepare_string_for_textile(self.remove_newlines(tail))
        text.append(tail)

    return text
def dump_text(self, elem_tree, stylizer, page, tag_stack=None):
    '''
    This function is intended to be used in a recursive manner. dump_text will
    run through all elements in the elem_tree and call itself on each element.

    self.image_hrefs will be populated by calling this function.

    @param elem_tree: etree representation of XHTML content to be transformed.
    @param stylizer: Used to track the style of elements within the tree.
    @param page: OEB page used to determine absolute urls.
    @param tag_stack: List of open FB2 tags to take into account. Leave it
        unset for the outermost call.

    @return: List of strings representing the XHTML converted to FB2 markup.
    '''
    from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

    # Never use a list as the default value: it would be one object shared
    # by every call.
    if tag_stack is None:
        tag_stack = []

    elem = elem_tree

    # Ensure what we are converting is not a string and that the first tag
    # is part of the XHTML namespace.
    if not isinstance(elem_tree.tag, string_or_bytes) or namespace(
            elem_tree.tag) != XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                and elem.tail:
            return [elem.tail]
        return []

    style = stylizer.style(elem_tree)
    # Hidden elements contribute nothing but their tail.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return []

    # FB2 generated output.
    fb2_out = []
    # FB2 tags in the order they are opened. This will be used to close the tags.
    tags = []
    # First tag in tree
    tag = barename(elem_tree.tag)
    # Number of blank lines above tag
    try:
        ems = int(round((float(style.marginTop) / style.fontSize) - 1))
        if ems < 0:
            ems = 0
    except Exception:
        # Best effort: a margin that cannot be computed means no blank
        # lines. Interpreter-exit signals are no longer swallowed.
        ems = 0

    # Convert TOC entries to <title>s and add <section>s
    if self.opts.sectionize == 'toc':
        # A section cannot be a child of any other element than another section,
        # so leave the tag alone if there are parents
        if not tag_stack:
            # There are two reasons to start a new section here: the TOC pointed to
            # this page (then we use the first non-<body> on the page as a <title>), or
            # the TOC pointed to a specific element
            newlevel = 0
            toc_entry = self.toc.get(page.href, None)
            if toc_entry is not None:
                if None in toc_entry:
                    if tag != 'body' and hasattr(
                            elem_tree, 'text') and elem_tree.text:
                        newlevel = 1
                        self.toc[page.href] = None
                if not newlevel and elem_tree.attrib.get('id', None) is not None:
                    newlevel = toc_entry.get(
                        elem_tree.attrib.get('id', None), None)

            # Start a new section if necessary
            if newlevel:
                # Close sections until the new one can be opened at the
                # requested depth.
                while newlevel <= self.section_level:
                    fb2_out.append('</section>')
                    self.section_level -= 1
                fb2_out.append('<section>')
                self.section_level += 1
                fb2_out.append('<title>')
                tags.append('title')
        if self.section_level == 0:
            # If none of the prior processing made a section, make one now to be FB2 spec compliant
            fb2_out.append('<section>')
            self.section_level += 1

    # Process the XHTML tag and styles. Converted to an FB2 tag.
    # Use individual if statement not if else. There can be
    # only one XHTML tag but it can have multiple styles.
    if tag == 'img':
        if elem_tree.attrib.get('src', None):
            # Only write the image tag if it is in the manifest.
            ihref = urlnormalize(page.abshref(elem_tree.attrib['src']))
            if ihref in self.oeb_book.manifest.hrefs:
                if ihref not in self.image_hrefs:
                    self.image_hrefs[ihref] = '_%s.jpg' % len(
                        self.image_hrefs)
                p_txt, p_tag = self.ensure_p()
                fb2_out += p_txt
                tags += p_tag
                fb2_out.append('<image xlink:href="#%s" />' %
                               self.image_hrefs[ihref])
            else:
                self.log.warn(u'Ignoring image not in manifest: %s' % ihref)
    if tag in ('br', 'hr') or ems >= 1:
        if ems < 1:
            multiplier = 1
        else:
            multiplier = ems
        if self.in_p:
            # Close the open tags up to and including the paragraph, write
            # the blank lines, then reopen the same tags in their original
            # order.
            closed_tags = []
            open_tags = tag_stack + tags
            open_tags.reverse()
            for t in open_tags:
                fb2_out.append('</%s>' % t)
                closed_tags.append(t)
                if t == 'p':
                    break
            fb2_out.append('<empty-line />' * multiplier)
            closed_tags.reverse()
            for t in closed_tags:
                fb2_out.append('<%s>' % t)
        else:
            fb2_out.append('<empty-line />' * multiplier)
    if tag in ('div', 'li', 'p'):
        p_text, added_p = self.close_open_p(tag_stack + tags)
        fb2_out += p_text
        if added_p:
            tags.append('p')
    if tag == 'b' or style['font-weight'] in ('bold', 'bolder'):
        s_out, s_tags = self.handle_simple_tag('strong', tag_stack + tags)
        fb2_out += s_out
        tags += s_tags
    if tag == 'i' or style['font-style'] == 'italic':
        s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack + tags)
        fb2_out += s_out
        tags += s_tags
    if tag in ('del', 'strike') or style['text-decoration'] == 'line-through':
        s_out, s_tags = self.handle_simple_tag('strikethrough', tag_stack + tags)
        fb2_out += s_out
        tags += s_tags
    if tag == 'sub':
        s_out, s_tags = self.handle_simple_tag('sub', tag_stack + tags)
        fb2_out += s_out
        tags += s_tags
    if tag == 'sup':
        s_out, s_tags = self.handle_simple_tag('sup', tag_stack + tags)
        fb2_out += s_out
        tags += s_tags

    # Process element text. Text outside of a paragraph gets one of its own.
    if hasattr(elem_tree, 'text') and elem_tree.text:
        if not self.in_p:
            fb2_out.append('<p>')
        fb2_out.append(prepare_string_for_xml(elem_tree.text))
        if not self.in_p:
            fb2_out.append('</p>')

    # Process sub-elements.
    for item in elem_tree:
        fb2_out += self.dump_text(item, stylizer, page, tag_stack + tags)

    # Close open FB2 tags.
    tags.reverse()
    fb2_out += self.close_tags(tags)

    # Process element text that comes after the close of the XHTML tag but
    # before the next XHTML tag.
    if hasattr(elem_tree, 'tail') and elem_tree.tail:
        if not self.in_p:
            fb2_out.append('<p>')
        fb2_out.append(prepare_string_for_xml(elem_tree.tail))
        if not self.in_p:
            fb2_out.append('</p>')

    return fb2_out
def dump_text(self, elem, stylizer, page):
    '''
    Flatten elem, its children and its tail text to plain text.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    @page: OEB page used to determine absolute urls.
    @return: List of strings; joined together they form the text.
    '''
    from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

    def is_xhtml(node):
        # True for real elements that live in the XHTML namespace.
        return isinstance(node.tag, string_or_bytes) and namespace(node.tag) == XHTML_NS

    # Anything that is not an XHTML element only contributes its tail, and
    # only when its parent is an XHTML element.
    if not is_xhtml(elem):
        parent = elem.getparent()
        if parent is not None and is_xhtml(parent) and elem.tail:
            return [elem.tail]
        return ['']

    out = ['']
    style = stylizer.style(elem)

    # Elements that are not displayed only contribute their tail.
    hidden = style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
        or style['visibility'] == 'hidden'
    if hidden:
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return ['']

    name = barename(elem.tag)
    anchor = elem.attrib.get('id', None)

    # A heading is either a heading tag or the target of a TOC entry.
    is_heading = name in HEADING_TAGS or '%s#%s' % (page.href, anchor) in self.toc_ids
    if is_heading and not self.last_was_heading:
        out.append('\n\n\n\n\n\n')

    # Paragraph level element?
    is_block = name in BLOCK_TAGS or style['display'] in BLOCK_STYLES
    if is_block and self.opts.remove_paragraph_spacing and not is_heading:
        out.append('\t')

    if name in SPACE_TAGS:
        out.append(' ')

    # Hard scene break.
    if name == 'hr':
        out.append('\n\n* * *\n\n')

    # Soft scene break: extra blank lines derived from the top margin. Any
    # failure while computing it just means no extra lines.
    try:
        blank_lines = int(round((float(style.marginTop) / style.fontSize) - 1))
        if blank_lines >= 1:
            out.append('\n' * blank_lines)
    except:
        pass

    # Text directly inside the element.
    if hasattr(elem, 'text') and elem.text:
        out.append(elem.text)

    # Children, in document order.
    for child in elem:
        out.extend(self.dump_text(child, stylizer, page))

    if is_block:
        out.append('\n\n')
    if is_heading:
        out.append('\n')
    # Remembered so that consecutive headings are not pushed apart again.
    self.last_was_heading = is_heading

    # Text following the element.
    if hasattr(elem, 'tail') and elem.tail:
        out.append(elem.tail)

    return out
def dump_text(self, subitems, elem, stylizer, end='', pre=False, li=''):
    '''
    Convert elem, its children and its tail text to SNB text.

    @subitems: Iterable of (href, title) pairs; an element whose id matches
        one of the hrefs gets a bookmark marker.
    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    @end: The last characters written before this element.
    @pre: True when inside preformatted content.
    @li: Pending list item prefix, written in front of the next text.
    @return: Tuple (list of strings, remaining li prefix). Every exit path
        returns this shape.
    '''
    from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

    # The early exits used to return a bare list. The recursive call takes
    # element [0] of the result, so a bare list handed it a plain string
    # that "text += t" then split into single characters, which in turn
    # broke the "end" look-behind for the following sibling.
    if not isinstance(elem.tag, string_or_bytes) \
            or namespace(elem.tag) != XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                and elem.tail:
            return [elem.tail], li
        return [''], li

    text = ['']
    style = stylizer.style(elem)

    # Write a bookmark marker when we reach the target of a new sub item.
    elem_id = elem.attrib.get('id')
    if elem_id is not None and any(elem_id == href for href, title in subitems):
        if self.curSubItem is not None and self.curSubItem != elem_id:
            self.curSubItem = elem_id
            text.append(f'\n\n{CALIBRE_SNB_BM_TAG}{self.curSubItem}\n\n')

    # Hidden elements contribute nothing but their tail.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail], li
        return [''], li

    tag = barename(elem.tag)
    in_block = False

    # Are we in a paragraph block?
    if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
        in_block = True
        # Do not stack a second paragraph break onto an existing one.
        if not end.endswith('\n\n') and hasattr(elem, 'text') and elem.text:
            text.append('\n\n')

    if tag in SPACE_TAGS:
        # Only add a separating space when the previous text does not
        # already end with one. The original test compared against the
        # literal 'u ', a leftover of a mangled u' ' string.
        if not end.endswith(' ') and hasattr(elem, 'text') and elem.text:
            text.append(' ')

    if tag == 'img':
        text.append('\n\n{}{}\n\n'.format(CALIBRE_SNB_IMG_TAG, ProcessFileName(elem.attrib['src'])))

    if tag == 'br':
        text.append('\n\n')

    if tag == 'li':
        li = '- '

    pre = (tag == 'pre' or pre)

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        if pre:
            # Every line of preformatted text is marked individually.
            text.append(('\n\n%s' % CALIBRE_SNB_PRE_TAG).join((li + elem.text).splitlines()))
        else:
            text.append(li + elem.text)
        li = ''

    for item in elem:
        # Tell the child what the output currently ends with.
        en = ''
        if len(text) >= 2:
            en = text[-1][-2:]
        t = self.dump_text(subitems, item, stylizer, en, pre, li)[0]
        text += t

    if in_block:
        text.append('\n\n')

    if hasattr(elem, 'tail') and elem.tail:
        if pre:
            text.append(('\n\n%s' % CALIBRE_SNB_PRE_TAG).join(elem.tail.splitlines()))
        else:
            text.append(li + elem.tail)
        li = ''

    return text, li
def dump_text(self, elem, stylizer):
    '''
    Convert elem, its children and its tail text to Textile markup.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    @return: List of strings; joined together they form the markup.
    '''

    # We can only process tags. If there isn't a tag return any text.
    if not isinstance(elem.tag, basestring) \
            or namespace(elem.tag) != XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                and elem.tail:
            return [elem.tail]
        return ['']

    # Setup our variables.
    # text collects the output; tags collects the closing markup, in the
    # order the constructs were opened (it is reversed before closing).
    text = ['']
    style = stylizer.style(elem)
    tags = []
    tag = barename(elem.tag)
    attribs = elem.attrib

    # Ignore anything that is set to not be displayed.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return ['']

    # Soft scene breaks.
    if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
        ems = min(int(round(float(style.marginTop) / style.fontSize) - 1), self.MAX_EM)
        if ems >= 1:
            text.append(u'\n\n\xa0' * ems)

    if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
        # A div is written as a paragraph block.
        if tag == 'div':
            tag = 'p'
        text.append(self.build_block(tag, style, attribs, stylizer))
        text.append('. ')
        tags.append('\n')

    # Inline styles. Each one is opened at most once (tracked by the
    # self.style_* flags); inside a link the bracket-less form is used.
    if style['font-style'] == 'italic' or tag in ('i', 'em'):
        if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
            if self.style_italic == False:
                if self.in_a_link:
                    text.append('_')
                    tags.append('_')
                else:
                    text.append('[_')
                    tags.append('_]')
                self.style_embed.append('_')
                self.style_italic = True
    if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
        if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
            if self.style_bold == False:
                if self.in_a_link:
                    text.append('*')
                    tags.append('*')
                else:
                    text.append('[*')
                    tags.append('*]')
                self.style_embed.append('*')
                self.style_bold = True
    if style['text-decoration'] == 'underline' or tag in ('u', 'ins'):
        if tag != 'a':
            if self.style_under == False:
                text.append('[+')
                tags.append('+]')
                self.style_embed.append('+')
                self.style_under = True
    if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'):
        if self.style_strike == False:
            text.append('[-')
            tags.append('-]')
            self.style_embed.append('-')
            self.style_strike = True
    if tag == 'br':
        # Close the open inline styles before the line break and reopen
        # them after it.
        for i in reversed(self.style_embed):
            text.append(i)
        text.append('\n')
        for i in self.style_embed:
            text.append(i)
        tags.append('')
        self.remove_space_after_newline = True
    if tag == 'blockquote':
        text.append('\nbq. ')
        tags.append('\n')
    elif tag in ('abbr', 'acronym'):
        text.append('')
        # The expansion is optional; without a title there is nothing to
        # put in parentheses.
        title = attribs.get('title')
        tags.append('' if title is None else '(' + title + ')')
    elif tag == 'sup':
        text.append('^')
        tags.append('^')
    elif tag == 'sub':
        text.append('~')
        tags.append('~')
    elif tag == 'code':
        if self.in_pre:
            text.append('\nbc. ')
            tags.append('')
        else:
            text.append('@')
            tags.append('@')
    elif tag == 'cite':
        text.append('??')
        tags.append('??')
    elif tag == 'hr':
        text.append('\n***')
        tags.append('\n')
    elif tag == 'pre':
        self.in_pre = True
        text.append('\npre. ')
        tags.append('pre\n')
    elif tag == 'a':
        if self.opts.keep_links:
            if 'href' in attribs:
                # Pushed so that after reversal the output reads
                # "text(title)":href
                text.append('"')
                tags.append('a')
                tags.append('":' + attribs['href'])
                self.our_links.append(attribs['href'])
                if 'title' in attribs:
                    tags.append('(' + attribs['title'] + ')')
                self.in_a_link = True
            else:
                text.append('%')
                tags.append('%')
    elif tag == 'img':
        # An image without a source cannot be referenced; skip it instead
        # of failing on the missing attribute.
        if self.opts.keep_image_references and 'src' in attribs:
            txt = '!' + self.check_halign(style)
            txt += self.check_valign(style)
            txt += attribs['src']
            text.append(txt)
            if 'alt' in attribs:
                txt = attribs['alt']
                if txt != '':
                    text.append('(' + txt + ')')
            tags.append('!')
    elif tag in ('ol', 'ul'):
        self.list.append({'name': tag, 'num': 0})
        text.append('')
        tags.append(tag)
    elif tag == 'li':
        # A stray list item is treated as part of an unordered list.
        if self.list:
            li = self.list[-1]
        else:
            li = {'name': 'ul', 'num': 0}
        text.append('\n')
        # The marker is repeated once per nesting level.
        if li['name'] == 'ul':
            text.append('*' * len(self.list) + ' ')
        elif li['name'] == 'ol':
            text.append('#' * len(self.list) + ' ')
        tags.append('')
    elif tag == 'dl':
        text.append('\n')
        tags.append('')
    elif tag == 'dt':
        text.append('')
        tags.append('\n')
    elif tag == 'dd':
        text.append(' ')
        tags.append('')
    elif tag == 'table':
        txt = self.build_block(tag, style, attribs, stylizer)
        txt += '. \n'
        # Only write the table block when it carries attributes/styles.
        if txt != '\ntable. \n':
            text.append(txt)
        else:
            text.append('\n')
        tags.append('')
    elif tag == 'tr':
        txt = self.build_block('', style, attribs, stylizer)
        txt += '. '
        if txt != '\n. ':
            txt = re.sub('\n', '', txt)
            text.append(txt)
        tags.append('|\n')
    elif tag == 'td':
        text.append('|')
        txt = ''
        txt += self.check_halign(style)
        txt += self.check_valign(style)
        if 'colspan' in attribs:
            txt += '\\' + attribs['colspan']
        if 'rowspan' in attribs:
            txt += '/' + attribs['rowspan']
        txt += self.check_styles(style)
        if txt != '':
            text.append(txt + '. ')
        tags.append('')
    elif tag == 'th':
        text.append('|_. ')
        tags.append('')
    elif tag == 'span':
        if style['font-variant'] == 'small-caps':
            if self.style_smallcap == False:
                text.append('&')
                tags.append('&')
                self.style_smallcap = True
        else:
            if self.in_a_link == False:
                txt = '%'
                if self.opts.keep_links:
                    txt += self.check_id_tag(attribs)
                    txt += self.check_styles(style)
                # A span that adds nothing is not written at all.
                if txt != '%':
                    text.append(txt)
                    tags.append('%')

    if self.opts.keep_links and 'id' in attribs:
        if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'):
            text.append(self.check_id_tag(attribs))

    # Process the styles for any that we want to keep
    if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img',
                   'span', 'table', 'tr', 'td'):
        if not self.in_a_link:
            text.append(self.check_styles(style))

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        txt = elem.text
        if not self.in_pre:
            txt = self.prepare_string_for_textile(self.remove_newlines(txt))
        text.append(txt)
        self.id_no_text = u''

    # Recurse down into tags within the tag we are in.
    for item in elem:
        text += self.dump_text(item, stylizer)

    # Close all open tags.
    tags.reverse()
    for t in tags:
        if t in ('pre', 'ul', 'ol', 'li', 'table'):
            # NOTE(review): the <pre> branch above pushes 'pre\n', which
            # never equals 'pre', so in_pre does not appear to be reset
            # here -- confirm before relying on it.
            if t == 'pre':
                self.in_pre = False
            elif t in ('ul', 'ol'):
                if self.list:
                    self.list.pop()
                if not self.list:
                    text.append('\n')
        else:
            if t == 'a':
                # 'a' is only a marker for the end of a link.
                self.in_a_link = False
                t = ''
            text.append(self.id_no_text)
            self.id_no_text = u''
            if t in ('*]', '*'):
                self.style_bold = False
            elif t in ('_]', '_'):
                self.style_italic = False
            elif t == '+]':
                self.style_under = False
            elif t == '-]':
                self.style_strike = False
            elif t == '&':
                self.style_smallcap = False
            if t in ('*]', '_]', '+]', '-]', '*', '_'):
                self.style_embed.pop()
            text.append('%s' % t)

    # Soft scene breaks.
    if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
        ems = min(int(round((float(style.marginBottom) / style.fontSize) - 1)), self.MAX_EM)
        if ems >= 1:
            text.append(u'\n\n\xa0' * ems)

    # Add the text that is outside of the tag.
    if hasattr(elem, 'tail') and elem.tail:
        tail = elem.tail
        if not self.in_pre:
            tail = self.prepare_string_for_textile(self.remove_newlines(tail))
        text.append(tail)

    return text
def dump_text(self, elem, stylizer, page, tag_stack=None):
    '''
    Recursively convert an XHTML element, its children and its tail text
    into a list of PML fragments.

    @param elem: The etree node to convert.
    @param stylizer: Object whose style(elem) gives the computed style.
    @param page: OEB page; used for abshref() and href lookups.
    @param tag_stack: PML tags already opened by the ancestors. It is never
        modified here; children receive a new list (tag_stack + tags).
    @return: List of strings making up the PML for this subtree.

    Side effects: may add entries to self.image_hrefs and self.link_hrefs.
    '''
    from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

    # A fresh list per top-level call instead of a shared mutable default.
    if tag_stack is None:
        tag_stack = []

    # Non-element nodes (comments, PIs) and foreign-namespace elements only
    # contribute their tail text, and only when the parent is XHTML.
    if not isinstance(elem.tag, string_or_bytes) or namespace(elem.tag) != XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                and elem.tail:
            return [elem.tail]
        return []

    text = []
    tags = []
    style = stylizer.style(elem)

    # Hidden elements are dropped, but their tail belongs to the parent.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
       or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return []

    tag = barename(elem.tag)

    # Are we in a paragraph block?
    if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
        tags.append('block')

    # Process tags that need special processing and that do not have inner
    # text. Usually these require an argument.
    if tag in IMAGE_TAGS:
        src = elem.attrib.get('src', None)
        if src:
            ihref = page.abshref(src)
            if ihref not in self.image_hrefs:
                if len(self.image_hrefs) == 0:
                    # The first image encountered is stored as the cover.
                    self.image_hrefs[ihref] = 'cover.png'
                else:
                    self.image_hrefs[ihref] = image_name(
                        '%s.png' % len(self.image_hrefs.keys()),
                        self.image_hrefs.keys()).strip('\x00')
            text.append('\\m="%s"' % self.image_hrefs[ihref])
    elif tag == 'hr':
        w = r'\w'
        width = elem.get('width')
        if width:
            if not width.endswith('%'):
                width += '%'
            w += '="%s"' % width
        else:
            w += '="50%"'
        text.append(w)
    elif tag == 'br':
        text.append('\n\\c \n\\c\n')

    # TOC markers.
    toc_name = elem.attrib.get('name', None)
    toc_id = elem.attrib.get('id', None)
    # Only write the TOC marker if the tag isn't a heading and we aren't in one.
    open_tags = tag_stack + tags
    in_heading = any(
        marker in open_tags for marker in ('x', 'X0', 'X1', 'X2', 'X3', 'X4'))
    if (toc_id or toc_name) and not in_heading and \
            tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        toc_page = page.href
        if self.toc.get(toc_page, None):
            for toc_x in (toc_name, toc_id):
                toc_title, toc_depth = self.toc[toc_page].get(
                    toc_x, (None, 0))
                if toc_title:
                    # Clamp the chapter depth to the 0..4 range.
                    toc_depth = max(min(toc_depth, 4), 0)
                    text.append(r'\C{}="{}"'.format(toc_depth, toc_title))

    # Explicit page break before this element.
    if style['page-break-before'] == 'always':
        text.append(r'\p')

    # Process basic PML tags.
    pml_tag = TAG_MAP.get(tag, None)
    if pml_tag and pml_tag not in tag_stack + tags:
        text.append(r'\%s' % pml_tag)
        tags.append(pml_tag)

    # Special processing of tags that require an argument.
    # Anchors links
    if tag in LINK_TAGS and 'q' not in tag_stack + tags:
        href = elem.get('href')
        if href:
            href = page.abshref(href)
            # Only internal links are written; external URLs are skipped.
            if '://' not in href:
                if '#' not in href:
                    href += '#'
                if href not in self.link_hrefs:
                    self.link_hrefs[href] = 'calibre_link-%s' % len(
                        self.link_hrefs.keys())
                href = '#%s' % self.link_hrefs[href]
                text.append(r'\q="%s"' % href)
                tags.append('q')

    # Anchor ids
    for name_x in (elem.get('id'), elem.get('name')):
        if name_x:
            text.append(self.get_anchor(page, name_x))

    # Processes style information
    for s in STYLES:
        style_tag = s[1].get(style[s[0]], None)
        if style_tag and style_tag not in tag_stack + tags:
            text.append(r'\%s' % style_tag)
            tags.append(style_tag)

    # margin left
    # Best effort: unusable margin/height values just mean no indent is
    # written. Exception (not a bare except) lets Ctrl-C/SystemExit through.
    try:
        mms = int(float(style['margin-left']) * 100 / style.height)
        if mms:
            text.append(r'\T="%s%%"' % mms)
    except Exception:
        pass

    # Soft scene breaks.
    try:
        ems = int(round((float(style.marginTop) / style.fontSize) - 1))
        if ems >= 1:
            text.append('\n\\c \n\\c\n')
    except Exception:
        pass

    # Process text within this tag.
    if hasattr(elem, 'text') and elem.text:
        text.append(self.prepare_string_for_pml(elem.text))

    # Process inner tags
    for item in elem:
        text += self.dump_text(item, stylizer, page, tag_stack + tags)

    # Close opened tags.
    tags.reverse()
    text += self.close_tags(tags)

    # if tag in SEPARATE_TAGS:
    #     text.append('\n\n')

    if style['page-break-after'] == 'always':
        text.append(r'\p')

    # Process text after this tag but not within another.
    if hasattr(elem, 'tail') and elem.tail:
        text.append(self.prepare_string_for_pml(elem.tail))

    return text
def dump_text(self, elem, stylizer, page):
    '''
    Recursively serialise an XHTML element to HTML, writing its computed
    style into an inline style attribute.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    @page: Passed through unchanged to the recursive calls.
    @return: List of strings making up the HTML for this subtree.

    Side effect: the 'class' and 'style' attributes are deleted from
    elem.attrib.
    '''
    # We can only process tags. If there isn't a tag return any text.
    if not isinstance(elem.tag, basestring) \
       or namespace(elem.tag) != XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                and elem.tail:
            return [elem.tail]
        return ['']

    # Setup our variables.
    text = ['']
    style = stylizer.style(elem)
    tags = []
    tag = barename(elem.tag)
    attribs = elem.attrib

    # presumably the style object renders as CSS declarations; verify
    style_a = '%s' % style
    if tag == 'body':
        # <body> is written as a <div> that starts on a new page.
        tag = 'div'
        if not style['page-break-before'] == 'always':
            # The parentheses matter: without them the conditional
            # expression swallowed the whole concatenation and the
            # element's own style was thrown away.
            style_a = 'page-break-before: always;' + (' ' if style_a else '') + style_a
    tags.append(tag)

    # Remove attributes we won't want.
    if 'class' in attribs:
        del attribs['class']
    if 'style' in attribs:
        del attribs['style']

    # Turn the rest of the attributes into a string we can write with the tag.
    at = ''.join(
        ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
        for k, v in attribs.items())

    # Turn style into strings for putting in the tag.
    style_t = ''
    if style_a:
        style_t = ' style="%s"' % style_a

    # Write the tag.
    text.append('<%s%s%s>' % (tag, at, style_t))

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        text.append(self.prepare_string_for_html(elem.text))

    # Recurse down into tags within the tag we are in.
    for item in elem:
        text += self.dump_text(item, stylizer, page)

    # Close all open tags.
    tags.reverse()
    for t in tags:
        text.append('</%s>' % t)

    # Add the text that is outside of the tag.
    if hasattr(elem, 'tail') and elem.tail:
        text.append(self.prepare_string_for_html(elem.tail))

    return text
def dump_text(self, elem, stylizer, page):
    '''
    Recursively serialise an XHTML element to HTML without CSS, turning
    bold/italic/underline/line-through styling into <b>/<i>/<u>/<s> tags.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    @page: Passed through unchanged to the recursive calls.
    @return: List of strings making up the HTML for this subtree.

    Side effect: the 'class' and 'style' attributes are deleted from
    elem.attrib.
    '''
    # We can only process tags. If there isn't a tag return any text.
    if not isinstance(elem.tag, basestring) \
       or namespace(elem.tag) != XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                and elem.tail:
            return [elem.tail]
        return ['']

    style = stylizer.style(elem)

    # Ignore anything that is set to not be displayed. The tail is text of
    # the (visible) parent that merely follows this element, so it must
    # still be written out rather than silently lost.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
       or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return [self.prepare_string_for_html(elem.tail)]
        return ['']

    # Setup our variables.
    text = ['']
    tags = []
    tag = barename(elem.tag)
    attribs = elem.attrib

    if tag == 'body':
        tag = 'div'
    tags.append(tag)

    # Remove attributes we won't want.
    if 'class' in attribs:
        del attribs['class']
    if 'style' in attribs:
        del attribs['style']

    # Turn the rest of the attributes into a string we can write with the tag.
    at = ''.join(
        ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
        for k, v in attribs.items())

    # Write the tag.
    text.append('<%s%s' % (tag, at))
    if tag in SELF_CLOSING_TAGS:
        text.append(' />')
    else:
        text.append('>')

    # Turn styles into tags.
    if style['font-weight'] in ('bold', 'bolder'):
        text.append('<b>')
        tags.append('b')
    if style['font-style'] == 'italic':
        text.append('<i>')
        tags.append('i')
    if style['text-decoration'] == 'underline':
        text.append('<u>')
        tags.append('u')
    if style['text-decoration'] == 'line-through':
        text.append('<s>')
        tags.append('s')

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        text.append(self.prepare_string_for_html(elem.text))

    # Recurse down into tags within the tag we are in.
    for item in elem:
        text += self.dump_text(item, stylizer, page)

    # Close all open tags, innermost first.
    tags.reverse()
    for t in tags:
        if t not in SELF_CLOSING_TAGS:
            text.append('</%s>' % t)

    # Add the text that is outside of the tag.
    if hasattr(elem, 'tail') and elem.tail:
        text.append(self.prepare_string_for_html(elem.tail))

    return text
def flatten_node(self, node, stylizer, names, styles, psize, item_id):
    '''
    Fold the computed style and the presentational attributes of one XHTML
    node into a single generated CSS class, then recurse into its children.

    @param node: etree node to process; non-XHTML nodes are ignored.
    @param stylizer: Object whose style(node) gives the computed style.
    @param names: Mapping of class base name -> counter used to build unique
        class names (presumably a defaultdict(int); verify against caller).
    @param styles: Mapping of serialised CSS text -> generated class name;
        new entries are added here.
    @param psize: Font size of the parent, used as the divisor when writing
        font sizes and line heights in em.
    @param item_id: Id of the item being flattened; only compared against
        'calibre_jacket'.
    @return: None. node (tag, attributes) and styles/names are modified.
    '''
    if not isinstance(node.tag, basestring) \
       or namespace(node.tag) != XHTML_NS:
        return
    tag = barename(node.tag)
    style = stylizer.style(node)
    cssdict = style.cssdict()
    try:
        font_size = style['font-size']
    except:
        # Fall back to the configured base size when the lookup fails.
        font_size = self.sbase if self.sbase is not None else \
            self.context.source.fbase
    # Convert the presentational 'align' attribute to CSS and drop it.
    if 'align' in node.attrib:
        if tag != 'img':
            cssdict['text-align'] = node.attrib['align']
        else:
            val = node.attrib['align']
            if val in ('middle', 'bottom', 'top'):
                cssdict['vertical-align'] = val
            elif val in ('left', 'right'):
                cssdict['text-align'] = val
        del node.attrib['align']
    # <font> is rewritten to <div> when it contains block-level descendants,
    # otherwise to <span>; its size/face attributes become CSS.
    if node.tag == XHTML('font'):
        tags = ['descendant::h:%s'%x for x in ('p', 'div', 'table', 'h1',
            'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul', 'dl', 'blockquote')]
        tag = 'div' if XPath('|'.join(tags))(node) else 'span'
        node.tag = XHTML(tag)
        if 'size' in node.attrib:
            def force_int(raw):
                # First run of digits/signs in raw, as an int.
                return int(re.search(r'([0-9+-]+)', raw).group(1))
            size = node.attrib['size'].strip()
            if size:
                fnums = self.context.source.fnums
                if size[0] in ('+', '-'):
                    # Oh, the warcrimes
                    # Relative size: offset from 3, clamped to 1..7.
                    try:
                        esize = 3 + force_int(size)
                    except:
                        esize = 3
                    if esize < 1:
                        esize = 1
                    if esize > 7:
                        esize = 7
                    font_size = fnums[esize]
                else:
                    # Absolute size; anything unusable falls back to 3.
                    try:
                        font_size = fnums[force_int(size)]
                    except:
                        font_size = fnums[3]
                cssdict['font-size'] = '%.1fpt'%font_size
            del node.attrib['size']
        if 'face' in node.attrib:
            cssdict['font-family'] = node.attrib['face']
            del node.attrib['face']
    if 'color' in node.attrib:
        cssdict['color'] = node.attrib['color']
        del node.attrib['color']
    if 'bgcolor' in node.attrib:
        cssdict['background-color'] = node.attrib['bgcolor']
        del node.attrib['bgcolor']
    if cssdict.get('font-weight', '').lower() == 'medium':
        cssdict['font-weight'] = 'normal'  # ADE chokes on font-weight medium

    # Font rescaling: express the mapped size in em relative to the parent,
    # then make this node's size the parent size for its children.
    fsize = font_size
    if not self.context.disable_font_rescaling:
        _sbase = self.sbase if self.sbase is not None else \
            self.context.source.fbase
        dyn_rescale = dynamic_rescale_factor(node)
        if dyn_rescale is not None:
            fsize = self.fmap[_sbase]
            fsize *= dyn_rescale
            cssdict['font-size'] = '%0.5fem'%(fsize/psize)
            psize = fsize
        elif 'font-size' in cssdict or tag == 'body':
            fsize = self.fmap[font_size]
            try:
                cssdict['font-size'] = "%0.5fem" % (fsize / psize)
            except ZeroDivisionError:
                # No usable parent size: write an absolute size instead.
                cssdict['font-size'] = '%.1fpt'%fsize
            psize = fsize

    # Enforce the configured minimum line height (given as a percentage).
    try:
        minlh = self.context.minimum_line_height / 100.
        if style['line-height'] < minlh * fsize:
            cssdict['line-height'] = str(minlh)
    except:
        self.oeb.logger.exception('Failed to set minimum line-height')

    # Drop the properties listed in self.filter_css.
    if cssdict:
        for x in self.filter_css:
            cssdict.pop(x, None)

    if cssdict:
        if self.lineh and self.fbase and tag != 'body':
            self.clean_edges(cssdict, style, psize)
        # Normalise the misspelling 'in-line'.
        if 'display' in cssdict and cssdict['display'] == 'in-line':
            cssdict['display'] = 'inline'
        # NOTE(review): with unfloat set this deletes 'display', not
        # 'float' - confirm that is intended.
        if self.unfloat and 'float' in cssdict \
           and cssdict.get('display', 'none') != 'none':
            del cssdict['display']
        # With untable set, table display values become inline/block.
        if self.untable and 'display' in cssdict \
           and cssdict['display'].startswith('table'):
            display = cssdict['display']
            if display == 'table-cell':
                cssdict['display'] = 'inline'
            else:
                cssdict['display'] = 'block'
        if 'vertical-align' in cssdict \
           and cssdict['vertical-align'] == 'sup':
            cssdict['vertical-align'] = 'super'
    if self.lineh and 'line-height' not in cssdict:
        lineh = self.lineh / psize
        cssdict['line-height'] = "%0.5fem" % lineh

    # Paragraph spacing options apply to <p> and <div> only.
    if (self.context.remove_paragraph_spacing or
            self.context.insert_blank_line) and tag in ('p', 'div'):
        if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
            for prop in ('margin', 'padding', 'border'):
                for edge in ('top', 'bottom'):
                    cssdict['%s-%s'%(prop, edge)] = '0pt'
        if self.context.insert_blank_line:
            cssdict['margin-top'] = cssdict['margin-bottom'] = \
                '%fem'%self.context.insert_blank_line_size
        indent_size = self.context.remove_paragraph_spacing_indent_size
        # A negative indent size means existing indents are left alone.
        keep_indents = indent_size < 0.0
        if (self.context.remove_paragraph_spacing and not keep_indents and
                cssdict.get('text-align', None) not in ('center', 'right')):
            cssdict['text-indent'] = "%1.1fem" % indent_size

    if cssdict:
        # Serialise the properties in sorted order so that identical styles
        # always produce the same key in styles.
        # NOTE(review): items.sort() needs dict.items() to return a list,
        # i.e. Python 2 semantics - confirm before running on Python 3.
        items = cssdict.items()
        items.sort()
        css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
        classes = node.get('class', '').strip() or 'calibre'
        klass = STRIPNUM.sub('', classes.split()[0].replace('_', ''))
        if css in styles:
            # Reuse the class already generated for this exact CSS text.
            match = styles[css]
        else:
            match = klass + str(names[klass] or '')
            styles[css] = match
            names[klass] += 1
        node.attrib['class'] = match
    elif 'class' in node.attrib:
        del node.attrib['class']
    if 'style' in node.attrib:
        del node.attrib['style']
    for child in node:
        self.flatten_node(child, stylizer, names, styles, psize, item_id)
def dump_text(self, elem, stylizer, tag_stack=None):
    """
    Recursively convert an XHTML element, its children and its tail text
    into RTF markup.

    @param elem: The etree node to convert.
    @param stylizer: Object whose style(elem) gives the computed style.
    @param tag_stack: List of RTF groups currently open. It is pushed to and
        popped from in place, so it is shared down the recursion.
    @return: Unicode string with the RTF for this subtree.
    """
    from calibre.ebooks.oeb.base import XHTML_NS, namespace, barename, urlnormalize

    # The stack is mutated below. A shared default list would keep stale
    # entries for every later call if an exception interrupted a conversion.
    if tag_stack is None:
        tag_stack = []

    # Non-element or foreign-namespace nodes only contribute their tail.
    if not isinstance(elem.tag, basestring) or namespace(elem.tag) != XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS and elem.tail:
            return elem.tail
        return u""

    text = u""
    style = stylizer.style(elem)

    # Hidden elements are dropped, but their tail belongs to the parent.
    if style["display"] in ("none", "oeb-page-head", "oeb-page-foot") or style["visibility"] == "hidden":
        if hasattr(elem, "tail") and elem.tail:
            return elem.tail
        return u""

    tag = barename(elem.tag)
    # Number of entries this call pushed on tag_stack (and must pop again).
    tag_count = 0

    # Are we in a paragraph block?
    if tag in BLOCK_TAGS or style["display"] in BLOCK_STYLES:
        if "block" not in tag_stack:
            tag_count += 1
            tag_stack.append("block")

    # Process tags that need special processing and that do not have inner
    # text. Usually these require an argument
    if tag == "img":
        src = elem.get("src")
        if src:
            src = urlnormalize(self.currently_dumping_item.abshref(src))
            block_start = ""
            block_end = ""
            # An image outside any block gets its own paragraph group.
            if "block" not in tag_stack:
                block_start = "{\\par\\pard\\hyphpar "
                block_end = "}"
            text += "%s SPECIAL_IMAGE-%s-REPLACE_ME %s" % (block_start, src, block_end)

    single_tag = SINGLE_TAGS.get(tag, None)
    if single_tag:
        text += single_tag

    rtf_tag = TAGS.get(tag, None)
    if rtf_tag and rtf_tag not in tag_stack:
        tag_count += 1
        text += "{%s\n" % rtf_tag
        tag_stack.append(rtf_tag)

    # Processes style information
    for s in STYLES:
        style_tag = s[1].get(style[s[0]], None)
        if style_tag and style_tag not in tag_stack:
            tag_count += 1
            text += "{%s\n" % style_tag
            tag_stack.append(style_tag)

    # Process tags that contain text.
    if hasattr(elem, "text") and elem.text:
        text += txt2rtf(elem.text)

    for item in elem:
        text += self.dump_text(item, stylizer, tag_stack)

    # Pop exactly what this call pushed. 'block' is only a marker and has
    # no RTF group of its own to close.
    for _ in range(tag_count):
        end_tag = tag_stack.pop()
        if end_tag != "block":
            if tag in BLOCK_TAGS:
                text += u"\\par\\pard\\plain\\hyphpar}"
            else:
                text += u"}"

    if hasattr(elem, "tail") and elem.tail:
        if "block" in tag_stack:
            text += "%s" % txt2rtf(elem.tail)
        else:
            # Tail text outside any block gets its own paragraph group.
            text += "{\\par\\pard\\hyphpar %s}" % txt2rtf(elem.tail)

    return text
def dump_text(self, elem_tree, stylizer, page, tag_stack=None):
    '''
    This function is intended to be used in a recursive manner. dump_text will
    run though all elements in the elem_tree and call itself on each element.

    self.image_hrefs will be populated by calling this function.

    @param elem_tree: etree representation of XHTML content to be transformed.
    @param stylizer: Used to track the style of elements within the tree.
    @param page: OEB page used to determine absolute urls.
    @param tag_stack: List of open FB2 tags to take into account. It is never
        modified here; children receive a new list (tag_stack + tags).

    @return: List of string representing the XHTML converted to FB2 markup.

    Side effects: may change self.section_level and self.toc; the helpers
    called here are expected to maintain self.in_p.
    '''
    from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

    # A fresh list per top-level call instead of a shared mutable default.
    if tag_stack is None:
        tag_stack = []

    elem = elem_tree

    # Ensure what we are converting is not a string and that the first tag is part of the XHTML namespace.
    if not isinstance(elem_tree.tag, basestring) or namespace(elem_tree.tag) != XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                and elem.tail:
            return [elem.tail]
        return []

    style = stylizer.style(elem_tree)
    # Hidden elements are dropped, but their tail belongs to the parent.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
       or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return []

    # FB2 generated output.
    fb2_out = []
    # FB2 tags in the order they are opened. This will be used to close the tags.
    tags = []
    # First tag in tree
    tag = barename(elem_tree.tag)
    # Number of blank lines above tag
    # Best effort: an unusable margin or font size simply means no blank
    # lines. Exception (not a bare except) lets Ctrl-C/SystemExit through.
    try:
        ems = int(round((float(style.marginTop) / style.fontSize) - 1))
        if ems < 0:
            ems = 0
    except Exception:
        ems = 0

    # Convert TOC entries to <title>s and add <section>s
    if self.opts.sectionize == 'toc':
        # A section cannot be a child of any other element than another section,
        # so leave the tag alone if there are parents
        if not tag_stack:
            # There are two reasons to start a new section here: the TOC pointed to
            # this page (then we use the first non-<body> on the page as a <title>), or
            # the TOC pointed to a specific element
            newlevel = 0
            toc_entry = self.toc.get(page.href, None)
            if toc_entry is not None:
                if None in toc_entry:
                    if tag != 'body' and hasattr(elem_tree, 'text') and elem_tree.text:
                        newlevel = 1
                        # Mark the page as handled so only one title is made.
                        self.toc[page.href] = None
                if not newlevel and elem_tree.attrib.get('id', None) is not None:
                    newlevel = toc_entry.get(elem_tree.attrib.get('id', None), None)

            # Start a new section if necessary
            if newlevel:
                # Close sections until we are above the requested level.
                while newlevel <= self.section_level:
                    fb2_out.append('</section>')
                    self.section_level -= 1
                fb2_out.append('<section>')
                self.section_level += 1
                fb2_out.append('<title>')
                tags.append('title')
        if self.section_level == 0:
            # If none of the prior processing made a section, make one now to be FB2 spec compliant
            fb2_out.append('<section>')
            self.section_level += 1

    # Process the XHTML tag and styles. Converted to an FB2 tag.
    # Use individual if statement not if else. There can be
    # only one XHTML tag but it can have multiple styles.
    if tag == 'img':
        if elem_tree.attrib.get('src', None):
            # Only write the image tag if it is in the manifest.
            ihref = urlnormalize(page.abshref(elem_tree.attrib['src']))
            if ihref in self.oeb_book.manifest.hrefs:
                if ihref not in self.image_hrefs:
                    self.image_hrefs[ihref] = '_%s.jpg' % len(self.image_hrefs)
                p_txt, p_tag = self.ensure_p()
                fb2_out += p_txt
                tags += p_tag
                fb2_out.append('<image xlink:href="#%s" />' % self.image_hrefs[ihref])
            else:
                self.log.warn(u'Ignoring image not in manifest: %s'%ihref)
    if tag in ('br', 'hr') or ems >= 1:
        multiplier = 1 if ems < 1 else ems
        if self.in_p:
            # Close everything up to and including the enclosing <p>, write
            # the empty lines, then reopen the same tags in original order.
            closed_tags = []
            open_tags = tag_stack + tags
            open_tags.reverse()
            for t in open_tags:
                fb2_out.append('</%s>' % t)
                closed_tags.append(t)
                if t == 'p':
                    break
            fb2_out.append('<empty-line />' * multiplier)
            closed_tags.reverse()
            for t in closed_tags:
                fb2_out.append('<%s>' % t)
        else:
            fb2_out.append('<empty-line />' * multiplier)
    if tag in ('div', 'li', 'p'):
        p_text, added_p = self.close_open_p(tag_stack + tags)
        fb2_out += p_text
        if added_p:
            tags.append('p')

    # Inline formatting, from either the tag itself or its computed style.
    # Order matters: each handler sees the tags opened by the previous ones.
    simple_tags = (
        ('strong', tag == 'b' or style['font-weight'] in ('bold', 'bolder')),
        ('emphasis', tag == 'i' or style['font-style'] == 'italic'),
        ('strikethrough', tag in ('del', 'strike') or style['text-decoration'] == 'line-through'),
        ('sub', tag == 'sub'),
        ('sup', tag == 'sup'),
    )
    for fb2_tag, wanted in simple_tags:
        if wanted:
            s_out, s_tags = self.handle_simple_tag(fb2_tag, tag_stack + tags)
            fb2_out += s_out
            tags += s_tags

    # Process element text.
    if hasattr(elem_tree, 'text') and elem_tree.text:
        # Text outside a paragraph gets a paragraph of its own.
        if not self.in_p:
            fb2_out.append('<p>')
        fb2_out.append(prepare_string_for_xml(elem_tree.text))
        if not self.in_p:
            fb2_out.append('</p>')

    # Process sub-elements.
    for item in elem_tree:
        fb2_out += self.dump_text(item, stylizer, page, tag_stack + tags)

    # Close open FB2 tags.
    tags.reverse()
    fb2_out += self.close_tags(tags)

    # Process element text that comes after the close of the XHTML tag but before the next XHTML tag.
    if hasattr(elem_tree, 'tail') and elem_tree.tail:
        if not self.in_p:
            fb2_out.append('<p>')
        fb2_out.append(prepare_string_for_xml(elem_tree.tail))
        if not self.in_p:
            fb2_out.append('</p>')

    return fb2_out
def dump_text(self, elem, stylizer, tag_stack=None):
    '''
    Recursively convert an XHTML element, its children and its tail text
    into RTF markup.

    @param elem: The etree node to convert.
    @param stylizer: Object whose style(elem) gives the computed style.
    @param tag_stack: List of RTF groups currently open. It is pushed to and
        popped from in place, so it is shared down the recursion.
    @return: Unicode string with the RTF for this subtree.
    '''
    from calibre.ebooks.oeb.base import (XHTML_NS, namespace, barename,
            urlnormalize)

    # The stack is mutated below. A shared default list would keep stale
    # entries for every later call if an exception interrupted a conversion.
    if tag_stack is None:
        tag_stack = []

    # Non-element or foreign-namespace nodes only contribute their tail.
    if not isinstance(elem.tag, basestring) \
       or namespace(elem.tag) != XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                and elem.tail:
            return elem.tail
        return u''

    text = u''
    style = stylizer.style(elem)

    # Hidden elements are dropped, but their tail belongs to the parent.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
       or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return elem.tail
        return u''

    tag = barename(elem.tag)
    # Number of entries this call pushed on tag_stack (and must pop again).
    tag_count = 0

    # Are we in a paragraph block?
    if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
        if 'block' not in tag_stack:
            tag_count += 1
            tag_stack.append('block')

    # Process tags that need special processing and that do not have inner
    # text. Usually these require an argument
    if tag == 'img':
        src = elem.get('src')
        if src:
            src = urlnormalize(self.currently_dumping_item.abshref(src))
            block_start = ''
            block_end = ''
            # An image outside any block gets its own paragraph group.
            if 'block' not in tag_stack:
                block_start = '{\\par\\pard\\hyphpar '
                block_end = '}'
            text += '%s SPECIAL_IMAGE-%s-REPLACE_ME %s' % (block_start, src, block_end)

    single_tag = SINGLE_TAGS.get(tag, None)
    if single_tag:
        text += single_tag

    rtf_tag = TAGS.get(tag, None)
    if rtf_tag and rtf_tag not in tag_stack:
        tag_count += 1
        text += '{%s\n' % rtf_tag
        tag_stack.append(rtf_tag)

    # Processes style information
    for s in STYLES:
        style_tag = s[1].get(style[s[0]], None)
        if style_tag and style_tag not in tag_stack:
            tag_count += 1
            text += '{%s\n' % style_tag
            tag_stack.append(style_tag)

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        text += txt2rtf(elem.text)

    for item in elem:
        text += self.dump_text(item, stylizer, tag_stack)

    # Pop exactly what this call pushed. 'block' is only a marker and has
    # no RTF group of its own to close.
    for _ in range(tag_count):
        end_tag = tag_stack.pop()
        if end_tag != 'block':
            if tag in BLOCK_TAGS:
                text += u'\\par\\pard\\plain\\hyphpar}'
            else:
                text += u'}'

    if hasattr(elem, 'tail') and elem.tail:
        if 'block' in tag_stack:
            text += '%s' % txt2rtf(elem.tail)
        else:
            # Tail text outside any block gets its own paragraph group.
            text += '{\\par\\pard\\hyphpar %s}' % txt2rtf(elem.tail)

    return text
def mobimlize_elem(self, elem, stylizer, bstate, istates, ignore_valign=False):
    '''
    Convert one XHTML element (and, recursively, its children) into output
    content by updating the block state and the stack of inline states and
    calling self.mobimlize_content() for each piece of text.

    @param elem: The etree element to convert. It may be modified in place
        (text padding, quote marks, table attributes, conversion of hidden
        elements with an id into empty anchors).
    @param stylizer: Object whose style(elem) gives the computed style.
    @param bstate: Block-level state (current paragraph, pending vertical
        margin/padding, page break flag, ...).
    @param istates: Stack (list) of inline format states. A copy of the top
        entry is pushed for this element and popped again before returning.
    @param ignore_valign: When true, vertical-align handling is skipped; used
        by the recursive call that renders sub/superscript content.
    @return: None.
    '''
    if not isinstance(elem.tag, string_or_bytes) \
       or namespace(elem.tag) != XHTML_NS:
        return
    style = stylizer.style(elem)
    # <mbp:frame-set/> does not exist lalalala
    if ((style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or style['visibility'] == 'hidden') and
            elem.get('data-calibre-jacket-searchable-tags', None) != '1'):
        id_ = elem.get('id', None)
        if id_:
            # Keep anchors so people can use display:none
            # to generate hidden TOCs
            # The element is emptied and turned into <a id=...>, keeping
            # only its id and tail.
            tail = elem.tail
            elem.clear()
            elem.text = None
            elem.set('id', id_)
            elem.tail = tail
            elem.tag = XHTML('a')
        else:
            return
    tag = barename(elem.tag)
    # Each element works on its own copy of the parent's inline state.
    istate = copy.copy(istates[-1])
    istate.rendered = False
    istate.list_num = 0
    if tag == 'ol' and 'start' in elem.attrib:
        try:
            istate.list_num = int(elem.attrib['start']) - 1
        except:
            pass
    istates.append(istate)
    left = 0
    # Table display values are treated as plain inline/block here.
    display = style['display']
    if display == 'table-cell':
        display = 'inline'
    elif display.startswith('table'):
        display = 'block'
    # Block-level means: not inline, not hidden, not floated and not <br>.
    isblock = (not display.startswith('inline') and style['display'] != 'none')
    isblock = isblock and style['float'] == 'none'
    isblock = isblock and tag != 'br'
    if isblock:
        # A block element always starts a new paragraph.
        bstate.para = None
        istate.halign = style['text-align']
        rawti = style._get('text-indent')
        try:
            istate.indent = style['text-indent']
        except Exception:
            istate.indent = 0
        if hasattr(rawti, 'strip') and '%' in rawti:
            # We have a percentage text indent, these can come out looking
            # too large if the user chooses a wide output profile like
            # tablet
            istate.indent = min(style._unit_convert(rawti, base=500), istate.indent)
        if style['margin-left'] == 'auto' \
           and style['margin-right'] == 'auto':
            istate.halign = 'center'
        margin = convert_margin(style, 'margin-left')
        padding = asfloat(style['padding-left'])
        if tag != 'body':
            left = margin + padding
        istate.left += left
        # Top margins collapse (max); padding forces the pending margin to
        # be added to the padding total instead.
        vmargin = convert_margin(style, 'margin-top')
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style['padding-top'])
        if vpadding > 0:
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    elif not istate.href:
        # Inline element outside a link: emulate horizontal margin/padding
        # with non-breaking spaces added to the element's text.
        margin = convert_margin(style, 'margin-left')
        padding = asfloat(style['padding-left'])
        lspace = margin + padding
        if lspace > 0:
            spaces = int(round((lspace * 3) / style['font-size']))
            elem.text = ('\xa0' * spaces) + (elem.text or '')
        margin = convert_margin(style, 'margin-right')
        padding = asfloat(style['padding-right'])
        rspace = margin + padding
        if rspace > 0:
            spaces = int(round((rspace * 3) / style['font-size']))
            if len(elem) == 0:
                elem.text = (elem.text or '') + ('\xa0' * spaces)
            else:
                last = elem[-1]
                last.text = (last.text or '') + ('\xa0' * spaces)
    if bstate.content and style['page-break-before'] in PAGE_BREAKS:
        bstate.pbreak = True
    # Copy the text formatting of this element into the inline state.
    istate.fsize = self.mobimlize_font(style['font-size'])
    istate.italic = True if style['font-style'] == 'italic' else False
    weight = style['font-weight']
    istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
    istate.preserve = style['white-space'] == 'pre'
    istate.pre_wrap = style['white-space'] == 'pre-wrap'
    istate.bgcolor = style['background-color']
    istate.fgcolor = style['color']
    istate.strikethrough = style.effective_text_decoration == 'line-through'
    istate.underline = style.effective_text_decoration == 'underline'
    # Reduce the font family to one of monospace / sans-serif / serif.
    ff = style['font-family'].lower() if hasattr(style['font-family'], 'lower') else ''
    if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'):
        istate.family = 'monospace'
    elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff or
          'arial' in ff or 'helvetica' in ff):
        istate.family = 'sans-serif'
    else:
        istate.family = 'serif'
    # Both id and name are recorded as anchor ids.
    if 'id' in elem.attrib:
        istate.ids.add(elem.attrib['id'])
    if 'name' in elem.attrib:
        istate.ids.add(elem.attrib['name'])
    if tag == 'a' and 'href' in elem.attrib:
        istate.href = elem.attrib['href']
    istate.attrib.clear()
    if tag == 'img' and 'src' in elem.attrib:
        istate.attrib['src'] = elem.attrib['src']
        istate.attrib['align'] = 'baseline'
        cssdict = style.cssdict()
        valign = cssdict.get('vertical-align', None)
        if valign in ('top', 'bottom', 'middle'):
            istate.attrib['align'] = valign
        for prop in ('width', 'height'):
            if cssdict[prop] != 'auto':
                value = style[prop]
                if value == getattr(self.profile, prop):
                    result = '100%'
                else:
                    # Amazon's renderer does not support
                    # img sizes in units other than px
                    # See #7520 for test case
                    try:
                        pixs = int(
                            round(float(value) / (72 / self.profile.dpi)))
                    except:
                        continue
                    result = str(pixs)
                istate.attrib[prop] = result
        if 'width' not in istate.attrib or 'height' not in istate.attrib:
            # At least one dimension is missing: read the real image size
            # and fill in the gap, keeping the aspect ratio.
            href = self.current_spine_item.abshref(elem.attrib['src'])
            try:
                item = self.oeb.manifest.hrefs[urlnormalize(href)]
            except:
                self.oeb.logger.warn('Failed to find image:', href)
            else:
                try:
                    width, height = identify(item.data)[1:]
                except Exception:
                    self.oeb.logger.warn('Invalid image:', href)
                else:
                    if 'width' not in istate.attrib and 'height' not in \
                            istate.attrib:
                        istate.attrib['width'] = str(width)
                        istate.attrib['height'] = str(height)
                    else:
                        ar = width / height
                        if 'width' not in istate.attrib:
                            try:
                                width = int(istate.attrib['height']) * ar
                            except:
                                pass
                            istate.attrib['width'] = str(int(width))
                        else:
                            try:
                                height = int(istate.attrib['width']) / ar
                            except:
                                pass
                            istate.attrib['height'] = str(int(height))
                item.unload_data_from_memory()
    elif tag == 'hr' and asfloat(style['width']) > 0 and style._get(
            'width') not in {'100%', 'auto'}:
        # Rule widths are written as a percentage, either taken verbatim
        # or computed relative to the profile width.
        raww = style._get('width')
        if hasattr(raww, 'strip') and '%' in raww:
            istate.attrib['width'] = raww
        else:
            prop = style['width'] / self.profile.width
            istate.attrib['width'] = "%d%%" % int(round(prop * 100))
    # NOTE(review): display was remapped above ('table-cell' -> 'inline',
    # other 'table*' -> 'block'), so the three branches below look
    # unreachable - confirm whether style['display'] was intended.
    elif display == 'table':
        tag = 'table'
    elif display == 'table-row':
        tag = 'tr'
    elif display == 'table-cell':
        tag = 'td'
    if tag in TABLE_TAGS and self.ignore_tables:
        tag = 'span' if tag == 'td' else 'div'

    if tag in ('table', 'td', 'tr'):
        # Carry background colour and borders over as HTML attributes.
        col = style.backgroundColor
        if col:
            elem.set('bgcolor', col)
        css = style.cssdict()
        if 'border' in css or 'border-width' in css:
            elem.set('border', '1')
    if tag in TABLE_TAGS:
        for attr in ('rowspan', 'colspan', 'width', 'border', 'scope',
                     'bgcolor'):
            if attr in elem.attrib:
                istate.attrib[attr] = elem.attrib[attr]
    if tag == 'q':
        # Wrap the content of <q> in typographic double quotes.
        t = elem.text
        if not t:
            t = ''
        elem.text = '\u201c' + t
        t = elem.tail
        if not t:
            t = ''
        elem.tail = '\u201d' + t
    text = None
    if elem.text:
        if istate.preserve or istate.pre_wrap:
            text = elem.text
        elif (len(elem) > 0 and isspace(elem.text) and hasattr(elem[0].tag, 'rpartition') and
                elem[0].tag.rpartition('}')[-1] not in INLINE_TAGS):
            # Whitespace-only text before a non-inline first child is dropped.
            text = None
        else:
            text = COLLAPSE.sub(' ', elem.text)
    valign = style['vertical-align']
    not_baseline = valign in ('super', 'sub', 'text-top',
            'text-bottom', 'top', 'bottom') or (isinstance(
            valign, numbers.Number) and abs(valign) != 0)
    issup = valign in ('super', 'text-top', 'top') or (isinstance(
        valign, numbers.Number) and valign > 0)
    vtag = 'sup' if issup else 'sub'
    if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock:
        # Raised/lowered inline content: render the element into a scratch
        # tree with ignore_valign=True, then move the result into a
        # <sup>/<sub><small> wrapper inside the current paragraph.
        nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
        vbstate = BlockState(etree.SubElement(nroot, XHTML('body')))
        vbstate.para = etree.SubElement(vbstate.body, XHTML('p'))
        self.mobimlize_elem(elem, stylizer, vbstate, istates,
                ignore_valign=True)
        # Pop the state pushed by this call, but never leave istates empty.
        if len(istates) > 0:
            istates.pop()
        if len(istates) == 0:
            istates.append(FormatState())
        at_start = bstate.para is None
        if at_start:
            self.mobimlize_content('span', '', bstate, istates)
        parent = bstate.para if bstate.inline is None else bstate.inline
        if parent is not None:
            vtag = etree.SubElement(parent, XHTML(vtag))
            vtag = etree.SubElement(vtag, XHTML('small'))
            # Add anchors
            for child in vbstate.body:
                if child is not vbstate.para:
                    vtag.append(child)
                else:
                    break
            if vbstate.para is not None:
                if vbstate.para.text:
                    vtag.text = vbstate.para.text
                for child in vbstate.para:
                    vtag.append(child)
            return

    if tag == 'blockquote':
        # Margins are always honoured inside blockquotes; the option is
        # restored after the children have been processed.
        old_mim = self.opts.mobi_ignore_margins
        self.opts.mobi_ignore_margins = False

    if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or (
            # We have an id but no text and no children, the id should still
            # be added.
            istate.ids and tag in ('a', 'span', 'i', 'b', 'u') and
            len(elem) == 0)):
        if tag == 'li' and len(istates) > 1 and 'value' in elem.attrib:
            # An explicit value on <li> resets the parent's list counter.
            try:
                value = int(elem.attrib['value'])
                istates[-2].list_num = value - 1
            except:
                pass
        self.mobimlize_content(tag, text, bstate, istates)
    for child in elem:
        self.mobimlize_elem(child, stylizer, bstate, istates)
        # The child's tail is text of this element, so it is emitted here.
        tail = None
        if child.tail:
            if istate.preserve or istate.pre_wrap:
                tail = child.tail
            elif bstate.para is None and isspace(child.tail):
                tail = None
            else:
                tail = COLLAPSE.sub(' ', child.tail)
        if tail:
            self.mobimlize_content(tag, tail, bstate, istates)

    if tag == 'blockquote':
        self.opts.mobi_ignore_margins = old_mim

    if bstate.content and style['page-break-after'] in PAGE_BREAKS:
        bstate.pbreak = True
    if isblock:
        para = bstate.para
        # A paragraph holding only a non-breaking space is a spacer.
        if para is not None and para.text == '\xa0' and len(para) < 1:
            if style.height > 2:
                para.getparent().replace(para, etree.Element(XHTML('br')))
            else:
                # This is too small to be rendered effectively, drop it
                para.getparent().remove(para)
        bstate.para = None
        bstate.istate = None
        # Same margin/padding bookkeeping as at the top, for the bottom edge.
        vmargin = convert_margin(style, 'margin-bottom')
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style['padding-bottom'])
        if vpadding > 0:
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    if bstate.nested and bstate.nested[-1].tag == elem.tag:
        bstate.nested.pop()
    # Balance the istates.append() done near the top of this call.
    istates.pop()
def dump_text(self, elem, stylizer, page, tag_stack=None):
    '''
    Serialise elem and its descendants as a list of upper-case markup
    fragments (<IMG SRC=...>, <A HREF=...>, style tags, escaped text).

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element; only
               stylizer.style(elem) is used.
    @page: Object providing abshref(), used to resolve image sources and
           link targets.
    @tag_stack: List of output tags currently open, shared down the
                recursion. Callers normally omit it.

    Returns a list of strings; the caller is expected to concatenate them.
    Side effects: may add entries to self.name_map and self.link_hrefs.
    '''
    from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

    # A literal [] default would be a single list shared by every call that
    # omits the argument; an exception raised half way through one document
    # would then leave stale entries behind for all later calls.
    if tag_stack is None:
        tag_stack = []

    # Comments, processing instructions and non-XHTML elements produce no
    # markup; only the text that follows them inside an XHTML parent is kept.
    if not isinstance(elem.tag, string_or_bytes) or namespace(elem.tag) != XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                and elem.tail:
            return [elem.tail]
        return ['']

    text = ['']
    style = stylizer.style(elem)

    # Hidden elements contribute nothing themselves, but their tail belongs
    # to the parent's flow and must survive.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
       or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return ['']

    tag = barename(elem.tag)
    # Number of tags this call opened, i.e. how many it must close again.
    tag_count = 0

    # Process tags that need special processing and that do not have inner
    # text. Usually these require an argument
    if tag in IMAGE_TAGS:
        src = elem.attrib.get('src', None)
        if src:
            abs_src = page.abshref(src)
            if abs_src not in self.name_map:
                self.name_map[abs_src] = unique_name(
                    '%s' % len(self.name_map), self.name_map.keys())
            text.append('<IMG SRC="%s">' % self.name_map[abs_src])

    rb_tag = tag.upper() if tag in TAGS else None
    if rb_tag:
        tag_count += 1
        text.append('<%s>' % rb_tag)
        tag_stack.append(rb_tag)

    # Anchors links
    if tag in LINK_TAGS:
        href = elem.get('href')
        if href:
            href = page.abshref(href)
            # Only links without a scheme are rewritten to generated
            # in-document anchors; anything containing '://' emits no <A>.
            if '://' not in href:
                if '#' not in href:
                    href += '#'
                if href not in self.link_hrefs:
                    self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs)
                href = self.link_hrefs[href]
                text.append('<A HREF="#%s">' % href)
                tag_count += 1
                tag_stack.append('A')

    # Anchor ids
    id_name = elem.get('id')
    if id_name:
        text.append(self.get_anchor(page, id_name))

    # Processes style information
    for s in STYLES:
        style_tag = s[1].get(style[s[0]], None)
        if style_tag:
            style_tag = style_tag.upper()
            tag_count += 1
            text.append('<%s>' % style_tag)
            tag_stack.append(style_tag)

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        text.append(prepare_string_for_xml(elem.text))

    for item in elem:
        text += self.dump_text(item, stylizer, page, tag_stack)

    # Hand the tags opened by this call (in opening order) to close_tags and
    # drop them from the shared stack. The explicit split index is needed
    # because tag_stack[-0:] would select the whole list.
    split = len(tag_stack) - tag_count
    close_tag_list = tag_stack[split:]
    del tag_stack[split:]
    text += self.close_tags(close_tag_list)

    if hasattr(elem, 'tail') and elem.tail:
        text.append(prepare_string_for_xml(elem.tail))

    return text
def dump_text(self, elem, stylizer, page):
    '''
    Serialise elem and its descendants as HTML fragments, replacing CSS
    bold/italic/underline/line-through with <b>/<i>/<u>/<s> tags.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    @page: Only passed through to the recursive calls.

    Returns a list of strings. Side effect: the class and style attributes
    are deleted from elem itself.
    '''

    # We can only process tags. If there isn't a tag return any text.
    if not isinstance(elem.tag, string_or_bytes) \
       or namespace(elem.tag) != XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                and elem.tail:
            # NOTE(review): this tail is emitted unescaped, unlike the tail
            # at the end of this method - confirm whether that is intended.
            return [elem.tail]
        return ['']

    style = stylizer.style(elem)

    # Ignore anything that is set to not be displayed. The tail is not part
    # of the hidden element (it is the parent's text that follows it), so it
    # must still be written out, escaped like every other tail.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
       or style['visibility'] == 'hidden':
        if elem.tail:
            return [self.prepare_string_for_html(elem.tail)]
        return ['']

    # Setup our variables.
    text = ['']
    tags = []
    tag = barename(elem.tag)
    attribs = elem.attrib

    if tag == 'body':
        tag = 'div'
    tags.append(tag)

    # Remove attributes we won't want.
    for unwanted in ('class', 'style'):
        if unwanted in attribs:
            del attribs[unwanted]

    # Turn the rest of the attributes into a string we can write with the tag.
    at = ''.join(
        ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
        for k, v in attribs.items())

    # Write the tag.
    text.append('<%s%s' % (tag, at))
    if tag in SELF_CLOSING_TAGS:
        text.append(' />')
    else:
        text.append('>')

    # Turn styles into tags. Order matters: tags are closed in reverse.
    style_tags = (
        (style['font-weight'] in ('bold', 'bolder'), 'b'),
        (style['font-style'] == 'italic', 'i'),
        (style['text-decoration'] == 'underline', 'u'),
        (style['text-decoration'] == 'line-through', 's'),
    )
    for applies, style_tag in style_tags:
        if applies:
            text.append('<%s>' % style_tag)
            tags.append(style_tag)

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        text.append(self.prepare_string_for_html(elem.text))

    # Recurse down into tags within the tag we are in.
    for item in elem:
        text += self.dump_text(item, stylizer, page)

    # Close all open tags.
    for t in reversed(tags):
        if t not in SELF_CLOSING_TAGS:
            text.append('</%s>' % t)

    # Add the text that is outside of the tag.
    if hasattr(elem, 'tail') and elem.tail:
        text.append(self.prepare_string_for_html(elem.tail))

    return text
def dump_text(self, elem, stylizer, page):
    '''
    Write elem and everything below it as a list of HTML fragments.
    Elements in the XHTML and SVG namespaces are written; anything else
    only contributes the text that follows it.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element (only handed
               on to the recursive calls).
    @page: Only handed on to the recursive calls.
    '''
    handled_namespaces = (XHTML_NS, SVG_NS)

    def writable(node_tag):
        # True for real element tags living in a namespace we serialise.
        return isinstance(node_tag, string_or_bytes) \
            and namespace(node_tag) in handled_namespaces

    # Non-elements / foreign elements: keep only the trailing text, and only
    # when the parent is itself something we write out.
    if not writable(elem.tag):
        parent = elem.getparent()
        if parent is not None and writable(parent.tag) and elem.tail:
            return [elem.tail]
        return ['']

    name = barename(elem.tag)
    if name == 'body':
        name = 'div'

    # The style attribute is dropped from the element itself.
    attributes = elem.attrib
    if 'style' in attributes:
        del attributes['style']

    # Remaining attributes, with any '{namespace}' prefix stripped from the
    # attribute name.
    rendered = ''.join(
        ' %s="%s"' % (key.split('}')[-1],
                      prepare_string_for_xml(value, attribute=True))
        for key, value in attributes.items())

    is_void = name in SELF_CLOSING_TAGS
    pieces = ['', '<%s%s' % (name, rendered), ' />' if is_void else '>']

    # Leading text of the element.
    if hasattr(elem, 'text') and elem.text:
        pieces.append(self.prepare_string_for_html(elem.text))

    # Children, depth first.
    for child in elem:
        pieces.extend(self.dump_text(child, stylizer, page))

    # Closing tag, unless the element was written in self-closing form.
    if not is_void:
        pieces.append('</%s>' % name)

    # Text following the element.
    if hasattr(elem, 'tail') and elem.tail:
        pieces.append(self.prepare_string_for_html(elem.tail))

    return pieces
def dump_text(self, elem, stylizer, page):
    '''
    Flatten elem and its descendants into a list of plain-text pieces,
    inserting blank lines, tabs and scene-break markers according to the
    tag and its computed style.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    @page: OEB page used to determine absolute urls.

    Side effect: updates self.last_was_heading.
    '''
    from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

    def is_xhtml(node_tag):
        return isinstance(node_tag, basestring) and namespace(node_tag) == XHTML_NS

    # Anything that is not an XHTML element only contributes its tail, and
    # only when it sits inside an XHTML parent.
    if not is_xhtml(elem.tag):
        parent = elem.getparent()
        if parent is not None and is_xhtml(parent.tag) and elem.tail:
            return [elem.tail]
        return ['']

    style = stylizer.style(elem)

    # Hidden elements are skipped, but the text following them is kept.
    hidden = style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
        or style['visibility'] == 'hidden'
    if hidden:
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return ['']

    out = ['']
    tag = barename(elem.tag)
    tag_id = elem.attrib.get('id', None)

    # A heading is either a heading tag or the target of a TOC entry.
    in_heading = tag in HEADING_TAGS or '%s#%s' % (page.href, tag_id) in self.toc_ids
    if in_heading and not self.last_was_heading:
        out.append('\n\n\n\n\n\n')

    # Paragraph-like blocks.
    in_block = tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES
    if in_block and self.opts.remove_paragraph_spacing and not in_heading:
        out.append(u'\t')

    if tag in SPACE_TAGS:
        out.append(u' ')

    # Hard scene breaks.
    if tag == 'hr':
        out.append('\n\n* * *\n\n')

    # Soft scene breaks: a top margin of N+1 font sizes becomes N newlines.
    # Any failure to compute it is deliberately ignored.
    try:
        blank_lines = int(round((float(style.marginTop) / style.fontSize) - 1))
        if blank_lines >= 1:
            out.append('\n' * blank_lines)
    except:
        pass

    # The element's own leading text.
    if hasattr(elem, 'text') and elem.text:
        out.append(elem.text)

    # Then everything nested inside it.
    for child in elem:
        out.extend(self.dump_text(child, stylizer, page))

    if in_block:
        out.append(u'\n\n')
    if in_heading:
        out.append(u'\n')
    # Remembered so that consecutive headings are not separated twice.
    self.last_was_heading = in_heading

    # Finally the text following the element.
    if hasattr(elem, 'tail') and elem.tail:
        out.append(elem.tail)

    return out
def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize, item_id):
    '''
    Fold the computed style and the presentational attributes of ``node``
    into a single CSS declaration set, replace the node's class attribute
    with a generated class for that set, then recurse into the children.

    @node: Element to process; anything that is not an XHTML element is
           ignored. The node is modified in place (tag for <font>,
           align/size/face/color/bgcolor/class/style attributes).
    @stylizer: Provides stylizer.style(node); body_font_size is set on it
               when the body element is seen.
    @names: Mapping of class base name -> number of classes generated with
            that base (read with names[klass] before the first increment,
            so presumably a defaultdict(int) - TODO confirm).
    @styles: Mapping of CSS text -> generated class name; filled in here.
    @pseudo_styles: Mapping of pseudo selector -> {CSS text -> class name}.
    @psize: Font size of the parent, used as divisor for em font sizes.
    @item_id: Id of the item being flattened; only compared with
              'calibre_jacket'.
    '''
    # Comments, processing instructions and foreign-namespace elements
    # carry no style to flatten.
    if not isinstance(node.tag, basestring) \
       or namespace(node.tag) != XHTML_NS:
        return
    tag = barename(node.tag)
    style = stylizer.style(node)
    cssdict = style.cssdict()
    try:
        font_size = style['font-size']
    except:
        # Fall back to the configured base size, then the source base size.
        font_size = self.sbase if self.sbase is not None else \
            self.context.source.fbase
    if tag == 'body' and isinstance(font_size, (int, float)):
        stylizer.body_font_size = font_size
    # Legacy align attribute: text-align for everything except images,
    # where it maps to vertical-align or float depending on the value.
    if 'align' in node.attrib:
        if tag != 'img':
            cssdict['text-align'] = node.attrib['align']
        else:
            val = node.attrib['align']
            if val in ('middle', 'bottom', 'top'):
                cssdict['vertical-align'] = val
            elif val in ('left', 'right'):
                cssdict['float'] = val
        del node.attrib['align']
    # <font> becomes a div when it contains block-level descendants and a
    # span otherwise; its size/face attributes become CSS.
    if node.tag == XHTML('font'):
        tags = [
            'descendant::h:%s' % x
            for x in ('p', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5',
                      'h6', 'ol', 'ul', 'dl', 'blockquote')
        ]
        tag = 'div' if XPath('|'.join(tags))(node) else 'span'
        node.tag = XHTML(tag)
        if 'size' in node.attrib:
            def force_int(raw):
                return int(re.search(r'([0-9+-]+)', raw).group(1))

            size = node.attrib['size'].strip()
            if size:
                fnums = self.context.source.fnums
                if size[0] in ('+', '-'):
                    # Oh, the warcrimes
                    # Relative size: offset from 3, clamped to 1..7.
                    try:
                        esize = 3 + force_int(size)
                    except:
                        esize = 3
                    if esize < 1:
                        esize = 1
                    if esize > 7:
                        esize = 7
                    font_size = fnums[esize]
                else:
                    # Absolute size; anything unusable falls back to 3.
                    try:
                        font_size = fnums[force_int(size)]
                    except:
                        font_size = fnums[3]
                cssdict['font-size'] = '%.1fpt' % font_size
            del node.attrib['size']
        if 'face' in node.attrib:
            cssdict['font-family'] = node.attrib['face']
            del node.attrib['face']
    # Colour attributes are dropped even when their value does not parse.
    if 'color' in node.attrib:
        try:
            cssdict['color'] = Property('color',
                                        node.attrib['color']).value
        except (ValueError, SyntaxErr):
            pass
        del node.attrib['color']
    if 'bgcolor' in node.attrib:
        try:
            cssdict['background-color'] = Property(
                'background-color', node.attrib['bgcolor']).value
        except (ValueError, SyntaxErr):
            pass
        del node.attrib['bgcolor']
    if cssdict.get('font-weight', '').lower() == 'medium':
        cssdict['font-weight'] = 'normal'  # ADE chokes on font-weight medium

    fsize = font_size
    # A left-floated, explicitly sized, childless node holding exactly one
    # character is treated as a drop cap and is exempt from rescaling.
    is_drop_cap = (cssdict.get('float', None) == 'left' and
                   'font-size' in cssdict and len(node) == 0 and
                   node.text and len(node.text) == 1)
    # Detect drop caps generated by the docx input plugin
    if (node.tag and node.tag.endswith('}p') and len(node) == 0 and
            node.text and len(node.text.strip()) == 1 and
            not node.tail and 'line-height' in cssdict and
            'font-size' in cssdict):
        dp = node.getparent()
        if dp.tag and dp.tag.endswith('}div') and len(
                dp) == 1 and not dp.text:
            if stylizer.style(dp).cssdict().get('float', None) == 'left':
                is_drop_cap = True
    if not self.context.disable_font_rescaling and not is_drop_cap:
        _sbase = self.sbase if self.sbase is not None else \
            self.context.source.fbase
        dyn_rescale = dynamic_rescale_factor(node)
        if dyn_rescale is not None:
            fsize = self.fmap[_sbase]
            fsize *= dyn_rescale
            cssdict['font-size'] = '%0.5fem' % (fsize / psize)
            psize = fsize
        elif 'font-size' in cssdict or tag == 'body':
            fsize = self.fmap[font_size]
            try:
                cssdict['font-size'] = "%0.5fem" % (fsize / psize)
            except ZeroDivisionError:
                # No usable parent size: emit an absolute size instead.
                cssdict['font-size'] = '%.1fpt' % fsize
            # From here on psize is this node's size; it is what the
            # children receive in the recursive call below.
            psize = fsize

    try:
        minlh = self.context.minimum_line_height / 100.
        if not is_drop_cap and style['line-height'] < minlh * fsize:
            cssdict['line-height'] = str(minlh)
    except:
        # Best effort only: log and carry on with the remaining properties.
        self.oeb.logger.exception('Failed to set minimum line-height')

    if cssdict:
        for x in self.filter_css:
            cssdict.pop(x, None)

    # Re-tested because the filtering above may have emptied cssdict.
    if cssdict:
        if self.lineh and self.fbase and tag != 'body':
            self.clean_edges(cssdict, style, psize)
        if 'display' in cssdict and cssdict['display'] == 'in-line':
            cssdict['display'] = 'inline'
        # NOTE(review): with unfloat set this removes 'display', not
        # 'float' - confirm that this is the intended property.
        if self.unfloat and 'float' in cssdict \
           and cssdict.get('display', 'none') != 'none':
            del cssdict['display']
        if self.untable and 'display' in cssdict \
           and cssdict['display'].startswith('table'):
            display = cssdict['display']
            if display == 'table-cell':
                cssdict['display'] = 'inline'
            else:
                cssdict['display'] = 'block'
        if 'vertical-align' in cssdict \
           and cssdict['vertical-align'] == 'sup':
            cssdict['vertical-align'] = 'super'
    if self.lineh and 'line-height' not in cssdict:
        lineh = self.lineh / psize
        cssdict['line-height'] = "%0.5fem" % lineh

    if (self.context.remove_paragraph_spacing or
            self.context.insert_blank_line) and tag in ('p', 'div'):
        if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
            for prop in ('margin', 'padding', 'border'):
                for edge in ('top', 'bottom'):
                    cssdict['%s-%s' % (prop, edge)] = '0pt'
        if self.context.insert_blank_line:
            cssdict['margin-top'] = cssdict['margin-bottom'] = \
                '%fem'%self.context.insert_blank_line_size
        indent_size = self.context.remove_paragraph_spacing_indent_size
        # A negative indent size leaves the existing text-indent alone.
        keep_indents = indent_size < 0.0
        if (self.context.remove_paragraph_spacing and not keep_indents and
                cssdict.get('text-align', None) not in ('center', 'right')):
            cssdict['text-indent'] = "%1.1fem" % indent_size

    pseudo_classes = style.pseudo_classes(self.filter_css)
    if cssdict or pseudo_classes:
        keep_classes = set()

        if cssdict:
            # Identical declaration sets share one generated class; the
            # sorted, joined text is the lookup key.
            items = sorted(cssdict.items())
            css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
            classes = node.get('class', '').strip() or 'calibre'
            klass = ascii_text(
                STRIPNUM.sub('',
                             classes.split()[0].replace('_', '')))
            if css in styles:
                match = styles[css]
            else:
                match = klass + str(names[klass] or '')
                styles[css] = match
                names[klass] += 1
            node.attrib['class'] = match
            keep_classes.add(match)

        # Note: the loop variable reuses the name cssdict; the node's own
        # declarations are not needed after this point.
        for psel, cssdict in pseudo_classes.iteritems():
            items = sorted(cssdict.iteritems())
            css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
            pstyles = pseudo_styles[psel]
            if css in pstyles:
                match = pstyles[css]
            else:
                # We have to use a different class for each psel as
                # otherwise you can have incorrect styles for a situation
                # like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
                # If the pcalibre class for a:hover and a:link is the same,
                # then the class attribute for a.x tags will contain both
                # that class and the class for a.x:hover, which is wrong.
                klass = 'pcalibre'
                match = klass + str(names[klass] or '')
                pstyles[css] = match
                names[klass] += 1
            keep_classes.add(match)
        node.attrib['class'] = ' '.join(keep_classes)

    elif 'class' in node.attrib:
        del node.attrib['class']
    if 'style' in node.attrib:
        del node.attrib['style']
    for child in node:
        self.flatten_node(child, stylizer, names, styles, pseudo_styles,
                          psize, item_id)
def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize, item_id):
    """
    Fold the computed style and the presentational attributes of ``node``
    into a single CSS declaration set, replace the node's class attribute
    with a generated class for that set, then recurse into the children.

    @node: Element to process; anything that is not an XHTML element is
           ignored. The node is modified in place (tag for <font>,
           align/size/face/color/bgcolor/class/style attributes).
    @stylizer: Provides stylizer.style(node).
    @names: Mapping of class base name -> number of classes generated with
            that base (read with names[klass] before the first increment,
            so presumably a defaultdict(int) - TODO confirm).
    @styles: Mapping of CSS text -> generated class name; filled in here.
    @pseudo_styles: Mapping of pseudo selector -> {CSS text -> class name}.
    @psize: Font size of the parent, used as divisor for em font sizes.
    @item_id: Id of the item being flattened; only compared with
              "calibre_jacket".
    """
    # Comments, processing instructions and foreign-namespace elements
    # carry no style to flatten.
    if not isinstance(node.tag, basestring) or namespace(node.tag) != XHTML_NS:
        return
    tag = barename(node.tag)
    style = stylizer.style(node)
    cssdict = style.cssdict()
    try:
        font_size = style["font-size"]
    except:
        # Fall back to the configured base size, then the source base size.
        font_size = self.sbase if self.sbase is not None else self.context.source.fbase
    # Legacy align attribute: text-align for everything except images,
    # where it maps to vertical-align or float depending on the value.
    if "align" in node.attrib:
        if tag != "img":
            cssdict["text-align"] = node.attrib["align"]
        else:
            val = node.attrib["align"]
            if val in ("middle", "bottom", "top"):
                cssdict["vertical-align"] = val
            elif val in ("left", "right"):
                cssdict["float"] = val
        del node.attrib["align"]
    # <font> becomes a div when it contains block-level descendants and a
    # span otherwise; its size/face attributes become CSS.
    if node.tag == XHTML("font"):
        tags = [
            "descendant::h:%s" % x
            for x in ("p", "div", "table", "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "dl", "blockquote")
        ]
        tag = "div" if XPath("|".join(tags))(node) else "span"
        node.tag = XHTML(tag)
        if "size" in node.attrib:

            def force_int(raw):
                return int(re.search(r"([0-9+-]+)", raw).group(1))

            size = node.attrib["size"].strip()
            if size:
                fnums = self.context.source.fnums
                if size[0] in ("+", "-"):
                    # Oh, the warcrimes
                    # Relative size: offset from 3, clamped to 1..7.
                    try:
                        esize = 3 + force_int(size)
                    except:
                        esize = 3
                    if esize < 1:
                        esize = 1
                    if esize > 7:
                        esize = 7
                    font_size = fnums[esize]
                else:
                    # Absolute size; anything unusable falls back to 3.
                    try:
                        font_size = fnums[force_int(size)]
                    except:
                        font_size = fnums[3]
                cssdict["font-size"] = "%.1fpt" % font_size
            del node.attrib["size"]
        if "face" in node.attrib:
            cssdict["font-family"] = node.attrib["face"]
            del node.attrib["face"]
    # Colour attributes are dropped even when their value does not parse.
    if "color" in node.attrib:
        try:
            cssdict["color"] = Property("color", node.attrib["color"]).value
        except (ValueError, SyntaxErr):
            pass
        del node.attrib["color"]
    if "bgcolor" in node.attrib:
        try:
            cssdict["background-color"] = Property("background-color", node.attrib["bgcolor"]).value
        except (ValueError, SyntaxErr):
            pass
        del node.attrib["bgcolor"]
    if cssdict.get("font-weight", "").lower() == "medium":
        cssdict["font-weight"] = "normal"  # ADE chokes on font-weight medium

    fsize = font_size
    # A left-floated, explicitly sized, childless node holding exactly one
    # character is treated as a drop cap and is exempt from rescaling.
    is_drop_cap = (
        cssdict.get("float", None) == "left"
        and "font-size" in cssdict
        and len(node) == 0
        and node.text
        and len(node.text) == 1
    )
    is_drop_cap = is_drop_cap or (
        # The docx input plugin generates drop caps that look like this
        len(node) == 1
        and not node.text
        and len(node[0]) == 0
        and node[0].text
        and not node[0].tail
        and len(node[0].text) == 1
        and "line-height" in cssdict
        and "font-size" in cssdict
    )
    if not self.context.disable_font_rescaling and not is_drop_cap:
        _sbase = self.sbase if self.sbase is not None else self.context.source.fbase
        dyn_rescale = dynamic_rescale_factor(node)
        if dyn_rescale is not None:
            fsize = self.fmap[_sbase]
            fsize *= dyn_rescale
            cssdict["font-size"] = "%0.5fem" % (fsize / psize)
            psize = fsize
        elif "font-size" in cssdict or tag == "body":
            fsize = self.fmap[font_size]
            try:
                cssdict["font-size"] = "%0.5fem" % (fsize / psize)
            except ZeroDivisionError:
                # No usable parent size: emit an absolute size instead.
                cssdict["font-size"] = "%.1fpt" % fsize
            # From here on psize is this node's size; it is what the
            # children receive in the recursive call below.
            psize = fsize

    try:
        minlh = self.context.minimum_line_height / 100.0
        if not is_drop_cap and style["line-height"] < minlh * fsize:
            cssdict["line-height"] = str(minlh)
    except:
        # Best effort only: log and carry on with the remaining properties.
        self.oeb.logger.exception("Failed to set minimum line-height")

    if cssdict:
        for x in self.filter_css:
            cssdict.pop(x, None)

    # Re-tested because the filtering above may have emptied cssdict.
    if cssdict:
        if self.lineh and self.fbase and tag != "body":
            self.clean_edges(cssdict, style, psize)
        if "display" in cssdict and cssdict["display"] == "in-line":
            cssdict["display"] = "inline"
        # NOTE(review): with unfloat set this removes "display", not
        # "float" - confirm that this is the intended property.
        if self.unfloat and "float" in cssdict and cssdict.get("display", "none") != "none":
            del cssdict["display"]
        if self.untable and "display" in cssdict and cssdict["display"].startswith("table"):
            display = cssdict["display"]
            if display == "table-cell":
                cssdict["display"] = "inline"
            else:
                cssdict["display"] = "block"
        if "vertical-align" in cssdict and cssdict["vertical-align"] == "sup":
            cssdict["vertical-align"] = "super"
    if self.lineh and "line-height" not in cssdict:
        lineh = self.lineh / psize
        cssdict["line-height"] = "%0.5fem" % lineh

    if (self.context.remove_paragraph_spacing or self.context.insert_blank_line) and tag in ("p", "div"):
        if item_id != "calibre_jacket" or self.context.output_profile.name == "Kindle":
            for prop in ("margin", "padding", "border"):
                for edge in ("top", "bottom"):
                    cssdict["%s-%s" % (prop, edge)] = "0pt"
        if self.context.insert_blank_line:
            cssdict["margin-top"] = cssdict["margin-bottom"] = "%fem" % self.context.insert_blank_line_size
        indent_size = self.context.remove_paragraph_spacing_indent_size
        # A negative indent size leaves the existing text-indent alone.
        keep_indents = indent_size < 0.0
        if (
            self.context.remove_paragraph_spacing
            and not keep_indents
            and cssdict.get("text-align", None) not in ("center", "right")
        ):
            cssdict["text-indent"] = "%1.1fem" % indent_size

    pseudo_classes = style.pseudo_classes(self.filter_css)
    if cssdict or pseudo_classes:
        keep_classes = set()

        if cssdict:
            # Identical declaration sets share one generated class; the
            # sorted, joined text is the lookup key.
            items = sorted(cssdict.items())
            css = u";\n".join(u"%s: %s" % (key, val) for key, val in items)
            classes = node.get("class", "").strip() or "calibre"
            klass = ascii_text(STRIPNUM.sub("", classes.split()[0].replace("_", "")))
            if css in styles:
                match = styles[css]
            else:
                match = klass + str(names[klass] or "")
                styles[css] = match
                names[klass] += 1
            node.attrib["class"] = match
            keep_classes.add(match)

        # Note: the loop variable reuses the name cssdict; the node's own
        # declarations are not needed after this point.
        for psel, cssdict in pseudo_classes.iteritems():
            items = sorted(cssdict.iteritems())
            css = u";\n".join(u"%s: %s" % (key, val) for key, val in items)
            pstyles = pseudo_styles[psel]
            if css in pstyles:
                match = pstyles[css]
            else:
                # We have to use a different class for each psel as
                # otherwise you can have incorrect styles for a situation
                # like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
                # If the pcalibre class for a:hover and a:link is the same,
                # then the class attribute for a.x tags will contain both
                # that class and the class for a.x:hover, which is wrong.
                klass = "pcalibre"
                match = klass + str(names[klass] or "")
                pstyles[css] = match
                names[klass] += 1
            keep_classes.add(match)
        node.attrib["class"] = " ".join(keep_classes)

    elif "class" in node.attrib:
        del node.attrib["class"]
    if "style" in node.attrib:
        del node.attrib["style"]
    for child in node:
        self.flatten_node(child, stylizer, names, styles, pseudo_styles, psize, item_id)
def dump_text(self, elem, stylizer, page, tag_stack=None):
    '''
    Serialise elem and its descendants as a list of upper-case markup
    fragments (<IMG SRC=...>, <A HREF=...>, style tags, escaped text).

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element; only
               stylizer.style(elem) is used.
    @page: Object providing abshref(), used to resolve image sources and
           link targets.
    @tag_stack: List of output tags currently open, shared down the
                recursion. Callers normally omit it.

    Returns a list of strings; the caller is expected to concatenate them.
    Side effects: may add entries to self.name_map and self.link_hrefs.
    '''
    from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

    # A literal [] default would be a single list shared by every call that
    # omits the argument; an exception raised half way through one document
    # would then leave stale entries behind for all later calls.
    if tag_stack is None:
        tag_stack = []

    # Comments, processing instructions and non-XHTML elements produce no
    # markup; only the text that follows them inside an XHTML parent is kept.
    if not isinstance(elem.tag, basestring) or namespace(elem.tag) != XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                and elem.tail:
            return [elem.tail]
        return [u'']

    text = [u'']
    style = stylizer.style(elem)

    # Hidden elements contribute nothing themselves, but their tail belongs
    # to the parent's flow and must survive.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
       or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return [u'']

    tag = barename(elem.tag)
    # Number of tags this call opened, i.e. how many it must close again.
    tag_count = 0

    # Process tags that need special processing and that do not have inner
    # text. Usually these require an argument
    if tag in IMAGE_TAGS:
        src = elem.attrib.get('src', None)
        if src:
            abs_src = page.abshref(src)
            if abs_src not in self.name_map:
                self.name_map[abs_src] = unique_name(
                    '%s' % len(self.name_map), self.name_map.keys())
            text.append('<IMG SRC="%s">' % self.name_map[abs_src])

    rb_tag = tag.upper() if tag in TAGS else None
    if rb_tag:
        tag_count += 1
        text.append('<%s>' % rb_tag)
        tag_stack.append(rb_tag)

    # Anchors links
    if tag in LINK_TAGS:
        href = elem.get('href')
        if href:
            href = page.abshref(href)
            # Only links without a scheme are rewritten to generated
            # in-document anchors; anything containing '://' emits no <A>.
            if '://' not in href:
                if '#' not in href:
                    href += '#'
                if href not in self.link_hrefs:
                    self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs)
                href = self.link_hrefs[href]
                text.append('<A HREF="#%s">' % href)
                tag_count += 1
                tag_stack.append('A')

    # Anchor ids
    id_name = elem.get('id')
    if id_name:
        text.append(self.get_anchor(page, id_name))

    # Processes style information
    for s in STYLES:
        style_tag = s[1].get(style[s[0]], None)
        if style_tag:
            style_tag = style_tag.upper()
            tag_count += 1
            text.append('<%s>' % style_tag)
            tag_stack.append(style_tag)

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        text.append(prepare_string_for_xml(elem.text))

    for item in elem:
        text += self.dump_text(item, stylizer, page, tag_stack)

    # Hand the tags opened by this call (in opening order) to close_tags and
    # drop them from the shared stack. The explicit split index is needed
    # because tag_stack[-0:] would select the whole list.
    split = len(tag_stack) - tag_count
    close_tag_list = tag_stack[split:]
    del tag_stack[split:]
    text += self.close_tags(close_tag_list)

    if hasattr(elem, 'tail') and elem.tail:
        text.append(prepare_string_for_xml(elem.tail))

    return text
def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize, item_id):
    '''
    Fold the computed style and the presentational attributes of ``node``
    into a single CSS declaration set, replace the node's class attribute
    with a generated class for that set, then recurse into the children.

    @node: Element to process; anything that is not an XHTML element is
           ignored. The node is modified in place (tag for <font>,
           align/valign/size/face/color/bgcolor/type/class/style
           attributes).
    @stylizer: Provides stylizer.style(node); body_font_size is set on it
               when the body element is seen.
    @names: Mapping of class base name -> number of classes generated with
            that base (read with names[klass] before the first increment,
            so presumably a defaultdict(int) - TODO confirm).
    @styles: Mapping of CSS text -> generated class name; filled in here.
    @pseudo_styles: Mapping of pseudo selector -> {CSS text -> class name}.
    @psize: Font size of the parent, used as divisor for em font sizes.
    @item_id: Id of the item being flattened; only compared with
              'calibre_jacket'.
    '''
    # Comments, processing instructions and foreign-namespace elements
    # carry no style to flatten.
    if not isinstance(node.tag, string_or_bytes) \
       or namespace(node.tag) != XHTML_NS:
        return
    tag = barename(node.tag)
    style = stylizer.style(node)
    cssdict = style.cssdict()
    try:
        font_size = style['font-size']
    except:
        # Fall back to the configured base size, then the source base size.
        font_size = self.sbase if self.sbase is not None else \
            self.context.source.fbase
    if tag == 'body' and isinstance(font_size, numbers.Number):
        stylizer.body_font_size = font_size
    # Legacy align attribute: text-align for everything except images,
    # where it maps to vertical-align or float depending on the value.
    if 'align' in node.attrib:
        if tag != 'img':
            cssdict['text-align'] = node.attrib['align']
            if cssdict['text-align'] == 'center':
                # align=center causes tables to be center aligned,
                # which text-align does not. And the ever trustworthy Word
                # uses this construct in its HTML output. See
                # https://bugs.launchpad.net/bugs/1569583
                if tag == 'table':
                    if 'margin-left' not in cssdict and 'margin-right' not in cssdict:
                        cssdict['margin-left'] = cssdict['margin-right'] = 'auto'
                else:
                    # Only direct child tables are centred, and only when
                    # they have neither horizontal margin set already.
                    for table in node.iterchildren(XHTML("table")):
                        ts = stylizer.style(table)
                        if ts.get('margin-left') is None and ts.get('margin-right') is None:
                            ts.set('margin-left', 'auto')
                            ts.set('margin-right', 'auto')
        else:
            val = node.attrib['align']
            if val in ('middle', 'bottom', 'top'):
                cssdict['vertical-align'] = val
            elif val in ('left', 'right'):
                cssdict['float'] = val
        del node.attrib['align']
    # valign on a table cell only takes effect when the CSS value is
    # 'inherit'; the attribute is removed either way.
    if 'valign' in node.attrib and tag == 'td':
        if cssdict.get('vertical-align') == 'inherit':
            cssdict['vertical-align'] = node.attrib['valign']
        del node.attrib['valign']
    # <font> becomes a div when it contains block-level descendants and a
    # span otherwise; its size/face attributes become CSS.
    if node.tag == XHTML('font'):
        tags = ['descendant::h:%s'%x for x in ('p', 'div', 'table', 'h1',
            'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul', 'dl', 'blockquote')]
        tag = 'div' if XPath('|'.join(tags))(node) else 'span'
        node.tag = XHTML(tag)
        if 'size' in node.attrib:
            def force_int(raw):
                return int(re.search(r'([0-9+-]+)', raw).group(1))
            size = node.attrib['size'].strip()
            if size:
                fnums = self.context.source.fnums
                if size[0] in ('+', '-'):
                    # Oh, the warcrimes
                    # Relative size: offset from 3, clamped to 1..7.
                    try:
                        esize = 3 + force_int(size)
                    except:
                        esize = 3
                    if esize < 1:
                        esize = 1
                    if esize > 7:
                        esize = 7
                    font_size = fnums[esize]
                else:
                    # Absolute size; anything unusable falls back to 3.
                    try:
                        font_size = fnums[force_int(size)]
                    except:
                        font_size = fnums[3]
                cssdict['font-size'] = '%.1fpt'%font_size
            del node.attrib['size']
        if 'face' in node.attrib:
            cssdict['font-family'] = node.attrib['face']
            del node.attrib['face']
    # Colour attributes are dropped even when their value does not parse.
    if 'color' in node.attrib:
        try:
            cssdict['color'] = Property('color', node.attrib['color']).value
        except (ValueError, SyntaxErr):
            pass
        del node.attrib['color']
    if 'bgcolor' in node.attrib:
        try:
            cssdict['background-color'] = Property('background-color', node.attrib['bgcolor']).value
        except (ValueError, SyntaxErr):
            pass
        del node.attrib['bgcolor']
    if tag == 'ol' and 'type' in node.attrib:
        del node.attrib['type']
    if cssdict.get('font-weight', '').lower() == 'medium':
        cssdict['font-weight'] = 'normal'  # ADE chokes on font-weight medium

    fsize = font_size
    # A left-floated, explicitly sized, childless node is treated as a drop
    # cap when it holds one character, or two characters of which the first
    # lies in U+2000..U+206F; drop caps are exempt from rescaling.
    is_drop_cap = (cssdict.get('float', None) == 'left' and 'font-size' in cssdict and len(node) == 0 and node.text and (
        len(node.text) == 1 or (len(node.text) == 2 and 0x2000 <= ord(node.text[0]) <= 0x206f)))
    # Detect drop caps generated by the docx input plugin
    if node.tag and node.tag.endswith('}p') and len(node) == 0 and node.text and len(node.text.strip()) == 1 and \
            not node.tail and 'line-height' in cssdict and 'font-size' in cssdict:
        dp = node.getparent()
        if dp.tag and dp.tag.endswith('}div') and len(dp) == 1 and not dp.text:
            if stylizer.style(dp).cssdict().get('float', None) == 'left':
                is_drop_cap = True
    if not self.context.disable_font_rescaling and not is_drop_cap:
        _sbase = self.sbase if self.sbase is not None else \
            self.context.source.fbase
        dyn_rescale = dynamic_rescale_factor(node)
        if dyn_rescale is not None:
            fsize = self.fmap[_sbase]
            fsize *= dyn_rescale
            cssdict['font-size'] = '%0.5fem'%(fsize/psize)
            psize = fsize
        elif 'font-size' in cssdict or tag == 'body':
            fsize = self.fmap[font_size]
            try:
                cssdict['font-size'] = "%0.5fem" % (fsize / psize)
            except ZeroDivisionError:
                # No usable parent size: emit an absolute size instead.
                cssdict['font-size'] = '%.1fpt'%fsize
            # From here on psize is this node's size; it is what the
            # children receive in the recursive call below.
            psize = fsize

    try:
        minlh = self.context.minimum_line_height / 100.
        if not is_drop_cap and style['line-height'] < minlh * fsize:
            cssdict['line-height'] = str(minlh)
    except:
        # Best effort only: log and carry on with the remaining properties.
        self.oeb.logger.exception('Failed to set minimum line-height')

    if cssdict:
        for x in self.filter_css:
            popval = cssdict.pop(x, None)
            # A filtered font-family is put back when its first family
            # name (with its first and last character sliced off) equals
            # that of self.body_font_family.
            if self.body_font_family and popval and x == 'font-family' \
                    and popval.partition(',')[0][1:-1] == self.body_font_family.partition(',')[0][1:-1]:
                cssdict[x] = popval

    # Re-tested because the filtering above may have emptied cssdict.
    if cssdict:
        if self.lineh and self.fbase and tag != 'body':
            self.clean_edges(cssdict, style, psize)
        if 'display' in cssdict and cssdict['display'] == 'in-line':
            cssdict['display'] = 'inline'
        # NOTE(review): with unfloat set this removes 'display', not
        # 'float' - confirm that this is the intended property.
        if self.unfloat and 'float' in cssdict \
           and cssdict.get('display', 'none') != 'none':
            del cssdict['display']
        if self.untable and 'display' in cssdict \
           and cssdict['display'].startswith('table'):
            display = cssdict['display']
            if display == 'table-cell':
                cssdict['display'] = 'inline'
            else:
                cssdict['display'] = 'block'
        if 'vertical-align' in cssdict \
           and cssdict['vertical-align'] == 'sup':
            cssdict['vertical-align'] = 'super'
    if self.lineh and 'line-height' not in cssdict:
        lineh = self.lineh / psize
        cssdict['line-height'] = "%0.5fem" % lineh

    if (self.context.remove_paragraph_spacing or
            self.context.insert_blank_line) and tag in ('p', 'div'):
        if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
            for prop in ('margin', 'padding', 'border'):
                for edge in ('top', 'bottom'):
                    cssdict['%s-%s'%(prop, edge)] = '0pt'
        if self.context.insert_blank_line:
            cssdict['margin-top'] = cssdict['margin-bottom'] = \
                '%fem'%self.context.insert_blank_line_size
        indent_size = self.context.remove_paragraph_spacing_indent_size
        # A negative indent size leaves the existing text-indent alone.
        keep_indents = indent_size < 0.0
        if (self.context.remove_paragraph_spacing and not keep_indents and
                cssdict.get('text-align', None) not in ('center', 'right')):
            cssdict['text-indent'] = "%1.1fem" % indent_size

    pseudo_classes = style.pseudo_classes(self.filter_css)
    if cssdict or pseudo_classes:
        keep_classes = set()

        if cssdict:
            # Identical declaration sets share one generated class; the
            # sorted, joined text is the lookup key.
            items = sorted(iteritems(cssdict))
            css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
            classes = node.get('class', '').strip() or 'calibre'
            # lower() because otherwise if the document uses the same class
            # name with different case, both cases will apply, leading
            # to incorrect results.
            klass = ascii_text(STRIPNUM.sub('', classes.split()[0])).lower().strip().replace(' ', '_')
            if css in styles:
                match = styles[css]
            else:
                match = klass + str(names[klass] or '')
                styles[css] = match
                names[klass] += 1
            node.attrib['class'] = match
            keep_classes.add(match)

        # Note: the loop variable reuses the name cssdict; the node's own
        # declarations are not needed after this point.
        for psel, cssdict in iteritems(pseudo_classes):
            items = sorted(iteritems(cssdict))
            css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
            pstyles = pseudo_styles[psel]
            if css in pstyles:
                match = pstyles[css]
            else:
                # We have to use a different class for each psel as
                # otherwise you can have incorrect styles for a situation
                # like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
                # If the pcalibre class for a:hover and a:link is the same,
                # then the class attribute for a.x tags will contain both
                # that class and the class for a.x:hover, which is wrong.
                klass = 'pcalibre'
                match = klass + str(names[klass] or '')
                pstyles[css] = match
                names[klass] += 1
            keep_classes.add(match)
        node.attrib['class'] = ' '.join(keep_classes)

    elif 'class' in node.attrib:
        del node.attrib['class']
    if 'style' in node.attrib:
        del node.attrib['style']
    for child in node:
        self.flatten_node(child, stylizer, names, styles, pseudo_styles, psize, item_id)
def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize, item_id, recurse=True):
    '''
    Collapse the computed style of ``node`` into a generated CSS class.

    Presentational attributes (``align``, ``valign``, ``size``, ``face``,
    ``color``, ``bgcolor``, ``type`` on ``ol``) are folded into the node's
    CSS dictionary and removed from the node, ``<font>`` tags are rewritten
    to ``<div>``/``<span>``, font sizes are re-expressed relative to the
    parent font size, and the resulting declarations are registered in
    ``styles``/``pseudo_styles`` so that identical declarations share a
    class name. The node's ``class`` attribute is replaced by the generated
    class name(s) and its ``style`` attribute is dropped.

    @node: The element to process. Nodes whose tag is not a string or is
           not in the XHTML namespace are ignored.
    @stylizer: Object whose ``style(element)`` returns the computed style
           of an element. Its ``body_font_size`` is set when a ``body``
           with a numeric font size is processed.
    @names: Mapping of class base name -> number of classes generated from
           it so far. It is read for keys that may be missing, so it is
           presumably a defaultdict(int) -- confirm against the caller.
    @styles: Mapping of serialized CSS declarations -> class name. Updated
           in place.
    @pseudo_styles: Mapping of pseudo selector -> (serialized CSS
           declarations -> class name). Updated in place.
    @psize: Font size of the parent, used as the divisor when font sizes
           and line heights are converted to em.
    @item_id: Id of the item being processed; only compared against
           'calibre_jacket'.
    @recurse: If True, the children of ``node`` are processed as well,
           with this node's font size passed on as their ``psize``.
    '''
    if not isinstance(node.tag, string_or_bytes) \
            or namespace(node.tag) != XHTML_NS:
        return
    tag = barename(node.tag)
    style = stylizer.style(node)
    cssdict = style.cssdict()
    try:
        font_size = style['font-size']
    except Exception:
        # Fall back to the base font size when the style lookup fails
        font_size = self.sbase if self.sbase is not None else \
            self.context.source.fbase
    if tag == 'body' and isinstance(font_size, numbers.Number):
        stylizer.body_font_size = font_size
    if 'align' in node.attrib:
        if tag != 'img':
            cssdict['text-align'] = node.attrib['align']
            if cssdict['text-align'] == 'center':
                # align=center causes tables to be center aligned,
                # which text-align does not. And the ever trustworthy Word
                # uses this construct in its HTML output. See
                # https://bugs.launchpad.net/bugs/1569583
                if tag == 'table':
                    if 'margin-left' not in cssdict and 'margin-right' not in cssdict:
                        cssdict['margin-left'] = cssdict[
                            'margin-right'] = 'auto'
                else:
                    for table in node.iterchildren(XHTML("table")):
                        ts = stylizer.style(table)
                        if ts.get('margin-left') is None and ts.get(
                                'margin-right') is None:
                            ts.set('margin-left', 'auto')
                            ts.set('margin-right', 'auto')
        else:
            # On images, align maps to vertical alignment or floating
            val = node.attrib['align']
            if val in ('middle', 'bottom', 'top'):
                cssdict['vertical-align'] = val
            elif val in ('left', 'right'):
                cssdict['float'] = val
        del node.attrib['align']
    if 'valign' in node.attrib and tag == 'td':
        # The attribute only applies when the stylesheet has not set an
        # explicit vertical-align
        if cssdict.get('vertical-align') == 'inherit':
            cssdict['vertical-align'] = node.attrib['valign']
        del node.attrib['valign']
    if node.tag == XHTML('font'):
        # A <font> wrapping block level content becomes a <div>,
        # otherwise a <span>
        tags = [
            'descendant::h:%s' % x
            for x in ('p', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5',
                      'h6', 'ol', 'ul', 'dl', 'blockquote')
        ]
        tag = 'div' if XPath('|'.join(tags))(node) else 'span'
        node.tag = XHTML(tag)
        if 'size' in node.attrib:
            def force_int(raw):
                return int(re.search(r'([0-9+-]+)', raw).group(1))
            size = node.attrib['size'].strip()
            if size:
                fnums = self.context.source.fnums
                if size[0] in ('+', '-'):
                    # Oh, the warcrimes
                    try:
                        esize = 3 + force_int(size)
                    except Exception:
                        esize = 3
                    # Relative sizes are offsets from 3, clamped to 1..7
                    if esize < 1:
                        esize = 1
                    if esize > 7:
                        esize = 7
                    font_size = fnums[esize]
                else:
                    try:
                        font_size = fnums[force_int(size)]
                    except Exception:
                        font_size = fnums[3]
                cssdict['font-size'] = '%.1fpt' % font_size
            del node.attrib['size']
        if 'face' in node.attrib:
            cssdict['font-family'] = node.attrib['face']
            del node.attrib['face']
    if 'color' in node.attrib:
        try:
            cssdict['color'] = Property('color', node.attrib['color']).value
        except (ValueError, SyntaxErr):
            # Unparseable colors are dropped
            pass
        del node.attrib['color']
    if 'bgcolor' in node.attrib:
        try:
            cssdict['background-color'] = Property(
                'background-color', node.attrib['bgcolor']).value
        except (ValueError, SyntaxErr):
            pass
        del node.attrib['bgcolor']
    if tag == 'ol' and 'type' in node.attrib:
        del node.attrib['type']
    if cssdict.get('font-weight', '').lower() == 'medium':
        cssdict[
            'font-weight'] = 'normal'  # ADE chokes on font-weight medium

    fsize = font_size
    # A left floated leaf node with an explicit font size that contains a
    # single character (optionally preceded by a character in the range
    # U+2000-U+206F) is treated as a drop cap
    is_drop_cap = (
        cssdict.get('float', None) == 'left' and 'font-size' in cssdict and
        len(node) == 0 and node.text and
        (len(node.text) == 1 or
         (len(node.text) == 2 and 0x2000 <= ord(node.text[0]) <= 0x206f)))
    # Detect drop caps generated by the docx input plugin
    if node.tag and node.tag.endswith('}p') and len(node) == 0 and node.text and len(node.text.strip()) == 1 and \
            not node.tail and 'line-height' in cssdict and 'font-size' in cssdict:
        dp = node.getparent()
        # getparent() returns None for a node without a parent
        if dp is not None and dp.tag and dp.tag.endswith('}div') and len(
                dp) == 1 and not dp.text:
            if stylizer.style(dp).cssdict().get('float', None) == 'left':
                is_drop_cap = True
    if not self.context.disable_font_rescaling and not is_drop_cap:
        _sbase = self.sbase if self.sbase is not None else \
            self.context.source.fbase
        dyn_rescale = node.attrib.pop('data-calibre-rescale', None)
        if dyn_rescale is not None:
            # The attribute holds a percentage of the mapped base size
            try:
                dyn_rescale = float(dyn_rescale) / 100
            except Exception:
                dyn_rescale = 1
            fsize = self.fmap[_sbase]
            fsize *= dyn_rescale
            try:
                cssdict['font-size'] = '%0.5fem' % (fsize / psize)
            except ZeroDivisionError:
                # Same fallback as the branch below: with a zero parent
                # size, emit an absolute size instead
                cssdict['font-size'] = '%.1fpt' % fsize
            psize = fsize
        elif 'font-size' in cssdict or tag == 'body':
            fsize = self.fmap[font_size]
            try:
                cssdict['font-size'] = "%0.5fem" % (fsize / psize)
            except ZeroDivisionError:
                cssdict['font-size'] = '%.1fpt' % fsize
            psize = fsize

    try:
        minlh = self.context.minimum_line_height / 100.
        slh = style['line-height']
        if not is_drop_cap and isinstance(
                slh, numbers.Number) and slh < minlh * fsize:
            cssdict['line-height'] = unicode_type(minlh)
    except Exception:
        self.oeb.logger.exception('Failed to set minimum line-height')

    if cssdict:
        for x in self.filter_css:
            popval = cssdict.pop(x, None)
            # A filtered font-family is restored when its first family,
            # with the first and last characters stripped, equals that of
            # the body font family
            if self.body_font_family and popval and x == 'font-family' \
                    and popval.partition(',')[0][1:-1] == self.body_font_family.partition(',')[0][1:-1]:
                cssdict[x] = popval

    if cssdict:
        if self.lineh and self.fbase and tag != 'body':
            self.clean_edges(cssdict, style, psize)
        if 'display' in cssdict and cssdict['display'] == 'in-line':
            cssdict['display'] = 'inline'
        if self.unfloat and 'float' in cssdict \
                and cssdict.get('display', 'none') != 'none':
            del cssdict['display']
        if self.untable and 'display' in cssdict \
                and cssdict['display'].startswith('table'):
            display = cssdict['display']
            if display == 'table-cell':
                cssdict['display'] = 'inline'
            else:
                cssdict['display'] = 'block'
        if 'vertical-align' in cssdict \
                and cssdict['vertical-align'] == 'sup':
            cssdict['vertical-align'] = 'super'
    if self.lineh and 'line-height' not in cssdict:
        lineh = self.lineh / psize
        cssdict['line-height'] = "%0.5fem" % lineh

    if (self.context.remove_paragraph_spacing or
            self.context.insert_blank_line) and tag in ('p', 'div'):
        if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
            for prop in ('margin', 'padding', 'border'):
                for edge in ('top', 'bottom'):
                    cssdict['%s-%s' % (prop, edge)] = '0pt'
        if self.context.insert_blank_line:
            cssdict['margin-top'] = cssdict['margin-bottom'] = \
                '%fem'%self.context.insert_blank_line_size
        indent_size = self.context.remove_paragraph_spacing_indent_size
        # A negative indent size means existing indents are left alone
        keep_indents = indent_size < 0.0
        if (self.context.remove_paragraph_spacing and not keep_indents and
                cssdict.get('text-align', None) not in ('center', 'right')):
            cssdict['text-indent'] = "%1.1fem" % indent_size

    pseudo_classes = style.pseudo_classes(self.filter_css)
    if cssdict or pseudo_classes:
        keep_classes = set()

        if cssdict:
            items = sorted(iteritems(cssdict))
            css = ';\n'.join(u'%s: %s' % (key, val) for key, val in items)
            classes = node.get('class', '').strip() or 'calibre'
            classes_list = classes.split()
            # lower() because otherwise if the document uses the same class
            # name with different case, both cases will apply, leading
            # to incorrect results.
            klass = ascii_text(STRIPNUM.sub(
                '', classes_list[0])).lower().strip().replace(' ', '_')
            if css in styles:
                match = styles[css]
            else:
                match = klass + unicode_type(names[klass] or '')
                styles[css] = match
                names[klass] += 1
            node.attrib['class'] = match
            keep_classes.add(match)

        for psel, cssdict in iteritems(pseudo_classes):
            items = sorted(iteritems(cssdict))
            css = ';\n'.join('%s: %s' % (key, val) for key, val in items)
            pstyles = pseudo_styles[psel]
            if css in pstyles:
                match = pstyles[css]
            else:
                # We have to use a different class for each psel as
                # otherwise you can have incorrect styles for a situation
                # like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
                # If the pcalibre class for a:hover and a:link is the same,
                # then the class attribute for a.x tags will contain both
                # that class and the class for a.x:hover, which is wrong.
                klass = 'pcalibre'
                match = klass + unicode_type(names[klass] or '')
                pstyles[css] = match
                names[klass] += 1
            keep_classes.add(match)
        node.attrib['class'] = ' '.join(keep_classes)

    elif 'class' in node.attrib:
        del node.attrib['class']
    if 'style' in node.attrib:
        del node.attrib['style']
    if recurse:
        for child in node:
            self.flatten_node(child, stylizer, names, styles, pseudo_styles, psize, item_id)