def dump_text(self, elem, stylizer, page, tag_stack=None):
    """Recursively convert ``elem`` and its subtree into PML text fragments.

    :param elem: Element to convert. It must expose ``tag``, ``text``,
        ``tail``, ``attrib``, ``get()`` and ``getparent()`` and be iterable
        over its children (presumably an lxml element -- confirm against
        the caller).
    :param stylizer: Object whose ``style(elem)`` returns the computed style
        for an element. The style is read by key (``style['display']``) and
        through the ``height``, ``marginTop`` and ``fontSize`` attributes.
    :param page: Object providing ``abshref(href)`` and ``href`` for the
        document that contains ``elem``.
    :param tag_stack: List of PML tag codes already opened by ancestor
        elements. It is never mutated; children receive a new list.
    :return: List of strings. Nothing is joined here.

    Side effects: new images are registered in ``self.image_hrefs`` and new
    internal link targets in ``self.link_hrefs``.
    """
    from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

    # ``None`` replaces the former mutable ``[]`` default so no list object
    # is shared between calls.
    if tag_stack is None:
        tag_stack = []

    # Non-XHTML nodes (e.g. comments) produce no markup of their own, but
    # the text that follows them inside an XHTML parent must be kept.
    # NOTE(review): this tail is returned without prepare_string_for_pml(),
    # unlike the tail handled at the end of this method -- confirm intended.
    if not isinstance(elem.tag, string_or_bytes) or namespace(elem.tag) != XHTML_NS:
        parent = elem.getparent()
        if (parent is not None and isinstance(parent.tag, string_or_bytes)
                and namespace(parent.tag) == XHTML_NS and elem.tail):
            return [elem.tail]
        return []

    text = []
    tags = []
    style = stylizer.style(elem)

    # Hidden elements contribute nothing except the text that follows them.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return []

    tag = barename(elem.tag)

    # Are we in a paragraph block?
    if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
        tags.append('block')

    # Process tags that need special processing and that do not have inner
    # text. Usually these require an argument.
    if tag in IMAGE_TAGS:
        src = elem.attrib.get('src', None)
        if src:
            # Resolve the href once; it is the key into self.image_hrefs.
            image_href = page.abshref(src)
            if image_href not in self.image_hrefs:
                if not self.image_hrefs:
                    # The first image encountered is named as the cover.
                    self.image_hrefs[image_href] = 'cover.png'
                else:
                    self.image_hrefs[image_href] = image_name(
                        '%s.png' % len(self.image_hrefs),
                        self.image_hrefs.keys()).strip('\x00')
            text.append('\\m="%s"' % self.image_hrefs[image_href])
    elif tag == 'hr':
        width = elem.get('width')
        if width:
            if not width.endswith('%'):
                width += '%'
        else:
            width = '50%'
        text.append(r'\w' + '="%s"' % width)
    elif tag == 'br':
        text.append('\n\\c \n\\c\n')

    # TOC markers.
    toc_name = elem.attrib.get('name', None)
    toc_id = elem.attrib.get('id', None)
    # Only write the TOC marker if the tag isn't a heading and we aren't in one.
    if (toc_id or toc_name) and tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        open_tags = tag_stack + tags
        if not any(code in open_tags for code in ('x', 'X0', 'X1', 'X2', 'X3', 'X4')):
            page_toc = self.toc.get(page.href, None)
            if page_toc:
                for toc_x in (toc_name, toc_id):
                    toc_title, toc_depth = page_toc.get(toc_x, (None, 0))
                    if toc_title:
                        # Clamp the depth to the 0..4 range used by \C0-\C4.
                        toc_depth = max(min(toc_depth, 4), 0)
                        text.append(fr'\C{toc_depth}="{toc_title}"')

    # Process style information that needs holds a single tag.
    # NOTE(review): an older comment here claimed this check was commented
    # out because every page in an OEB book starts with this style, yet the
    # check is live code -- confirm which is intended.
    if style['page-break-before'] == 'always':
        text.append(r'\p')

    # Process basic PML tags.
    pml_tag = TAG_MAP.get(tag, None)
    if pml_tag and pml_tag not in tag_stack + tags:
        text.append(r'\%s' % pml_tag)
        tags.append(pml_tag)

    # Special processing of tags that require an argument.
    # Anchors links
    if tag in LINK_TAGS and 'q' not in tag_stack + tags:
        href = elem.get('href')
        if href:
            href = page.abshref(href)
            # Only hrefs without a URL scheme are turned into \q links.
            if '://' not in href:
                if '#' not in href:
                    href += '#'
                if href not in self.link_hrefs:
                    self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs)
                href = '#%s' % self.link_hrefs[href]
                text.append(r'\q="%s"' % href)
                tags.append('q')

    # Anchor ids
    for name_x in (elem.get('id'), elem.get('name')):
        if name_x:
            text.append(self.get_anchor(page, name_x))

    # Processes style information
    for s in STYLES:
        style_tag = s[1].get(style[s[0]], None)
        if style_tag and style_tag not in tag_stack + tags:
            text.append(r'\%s' % style_tag)
            tags.append(style_tag)

    # margin left
    # Best effort: any failure to compute the indent simply means none is
    # written. ``Exception`` (rather than a bare except) lets
    # KeyboardInterrupt and SystemExit propagate.
    try:
        mms = int(float(style['margin-left']) * 100 / style.height)
        if mms:
            text.append(r'\T="%s%%"' % mms)
    except Exception:
        pass

    # Soft scene breaks.
    try:
        ems = int(round((float(style.marginTop) / style.fontSize) - 1))
        if ems >= 1:
            text.append('\n\\c \n\\c\n')
    except Exception:
        pass

    # Process text within this tag.
    if hasattr(elem, 'text') and elem.text:
        text.append(self.prepare_string_for_pml(elem.text))

    # Process inner tags
    for item in elem:
        text += self.dump_text(item, stylizer, page, tag_stack + tags)

    # Close opened tags.
    tags.reverse()
    text += self.close_tags(tags)

    if style['page-break-after'] == 'always':
        text.append(r'\p')

    # Process text after this tag but not within another.
    if hasattr(elem, 'tail') and elem.tail:
        text.append(self.prepare_string_for_pml(elem.tail))

    return text
def dump_text(self, elem, stylizer, page, tag_stack=None):
    """Recursively convert ``elem`` and its subtree into PML text fragments.

    :param elem: Element to convert. It must expose ``tag``, ``text``,
        ``tail``, ``attrib``, ``get()`` and ``getparent()`` and be iterable
        over its children (presumably an lxml element -- confirm against
        the caller).
    :param stylizer: Object whose ``style(elem)`` returns the computed style
        for an element. The style is read by key (``style['display']``) and
        through the ``height``, ``marginTop`` and ``fontSize`` attributes.
    :param page: Object providing ``abshref(href)`` and ``href`` for the
        document that contains ``elem``.
    :param tag_stack: List of PML tag codes already opened by ancestor
        elements. It is never mutated; children receive a new list.
    :return: List of strings. Nothing is joined here.

    Side effects: new images are registered in ``self.image_hrefs`` and new
    internal link targets in ``self.link_hrefs``.
    """
    from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

    # ``None`` replaces the former mutable ``[]`` default so no list object
    # is shared between calls.
    if tag_stack is None:
        tag_stack = []

    # Non-XHTML nodes (e.g. comments) produce no markup of their own, but
    # the text that follows them inside an XHTML parent is kept rather than
    # silently dropped.
    # NOTE(review): this tail is returned without prepare_string_for_pml(),
    # unlike the tail handled at the end of this method -- confirm intended.
    if not isinstance(elem.tag, basestring) \
            or namespace(elem.tag) != XHTML_NS:
        parent = elem.getparent()
        if parent is not None and isinstance(parent.tag, basestring) \
                and namespace(parent.tag) == XHTML_NS and elem.tail:
            return [elem.tail]
        return []

    text = []
    tags = []
    style = stylizer.style(elem)

    # Hidden elements contribute nothing except the text that follows them.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return []

    tag = barename(elem.tag)

    # Are we in a paragraph block?
    if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
        tags.append('block')

    # Process tags that need special processing and that do not have inner
    # text. Usually these require an argument.
    if tag in IMAGE_TAGS:
        src = elem.attrib.get('src', None)
        if src:
            # Resolve the href once; it is the key into self.image_hrefs.
            image_href = page.abshref(src)
            if image_href not in self.image_hrefs:
                if not self.image_hrefs:
                    # The first image encountered is named as the cover.
                    self.image_hrefs[image_href] = 'cover.png'
                else:
                    self.image_hrefs[image_href] = image_name(
                        '%s.png' % len(self.image_hrefs),
                        self.image_hrefs.keys()).strip('\x00')
            text.append('\\m="%s"' % self.image_hrefs[image_href])
    elif tag == 'hr':
        width = elem.get('width')
        if width:
            if not width.endswith('%'):
                width += '%'
        else:
            width = '50%'
        text.append('\\w' + '="%s"' % width)
    elif tag == 'br':
        text.append('\n\\c \n\\c\n')

    # TOC markers.
    toc_name = elem.attrib.get('name', None)
    toc_id = elem.attrib.get('id', None)
    # Only write the TOC marker if the tag isn't a heading and we aren't in one.
    if (toc_id or toc_name) and tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        open_tags = tag_stack + tags
        if not any(code in open_tags for code in ('x', 'X0', 'X1', 'X2', 'X3', 'X4')):
            page_toc = self.toc.get(page.href, None)
            if page_toc:
                for toc_x in (toc_name, toc_id):
                    toc_title, toc_depth = page_toc.get(toc_x, (None, 0))
                    if toc_title:
                        # Clamp the depth to the 0..4 range used by \C0-\C4.
                        toc_depth = max(min(toc_depth, 4), 0)
                        text.append('\\C%s="%s"' % (toc_depth, toc_title))

    # Process style information that needs holds a single tag.
    # NOTE(review): an older comment here claimed this check was commented
    # out because every page in an OEB book starts with this style, yet the
    # check is live code -- confirm which is intended.
    if style['page-break-before'] == 'always':
        text.append('\\p')

    # Process basic PML tags.
    pml_tag = TAG_MAP.get(tag, None)
    if pml_tag and pml_tag not in tag_stack + tags:
        text.append('\\%s' % pml_tag)
        tags.append(pml_tag)

    # Special processing of tags that require an argument.
    # Anchors links
    if tag in LINK_TAGS and 'q' not in tag_stack + tags:
        href = elem.get('href')
        if href:
            href = page.abshref(href)
            # Only hrefs without a URL scheme are turned into \q links.
            if '://' not in href:
                if '#' not in href:
                    href += '#'
                if href not in self.link_hrefs:
                    self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs)
                href = '#%s' % self.link_hrefs[href]
                text.append('\\q="%s"' % href)
                tags.append('q')

    # Anchor ids
    for name_x in (elem.get('id'), elem.get('name')):
        if name_x:
            text.append(self.get_anchor(page, name_x))

    # Processes style information
    for s in STYLES:
        style_tag = s[1].get(style[s[0]], None)
        if style_tag and style_tag not in tag_stack + tags:
            text.append('\\%s' % style_tag)
            tags.append(style_tag)

    # margin left
    # Best effort: any failure to compute the indent simply means none is
    # written. ``Exception`` (rather than a bare except) lets
    # KeyboardInterrupt and SystemExit propagate.
    try:
        mms = int(float(style['margin-left']) * 100 / style.height)
        if mms:
            text.append('\\T="%s%%"' % mms)
    except Exception:
        pass

    # Soft scene breaks.
    try:
        ems = int(round((float(style.marginTop) / style.fontSize) - 1))
        if ems >= 1:
            text.append('\n\\c \n\\c\n')
    except Exception:
        pass

    # Process text within this tag.
    if hasattr(elem, 'text') and elem.text:
        text.append(self.prepare_string_for_pml(elem.text))

    # Process inner tags
    for item in elem:
        text += self.dump_text(item, stylizer, page, tag_stack + tags)

    # Close opened tags.
    tags.reverse()
    text += self.close_tags(tags)

    if style['page-break-after'] == 'always':
        text.append('\\p')

    # Process text after this tag but not within another.
    if hasattr(elem, 'tail') and elem.tail:
        text.append(self.prepare_string_for_pml(elem.tail))

    return text