class Convert(object):
    """Walk the spine of an OEB book and serialize the result into ``docx``.

    Calling the instance processes every spine item, turning HTML tags into
    entries of a ``Blocks`` collection, and then writes document, styles,
    images and fonts onto the ``docx`` container.
    """

    def __init__(self, oeb, docx):
        """Store the book and the output container.

        :param oeb: The book to convert; its ``spine`` is iterated later.
        :param docx: The output container; ``log`` and ``opts`` are read
            from it.
        """
        self.oeb, self.docx = oeb, docx
        self.log, self.opts = docx.log, docx.opts

    def __call__(self):
        """Run the whole conversion and write the output."""
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        self.svg_rasterizer = SVGRasterizer()
        self.svg_rasterizer(self.oeb, self.opts)

        namespace = self.docx.namespace
        self.styles_manager = StylesManager(namespace)
        self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships)
        self.fonts_manager = FontsManager(namespace, self.oeb, self.opts)
        self.blocks = Blocks(namespace, self.styles_manager)

        for item in self.oeb.spine:
            self.process_item(item)

        self.styles_manager.finalize(self.blocks.all_blocks)
        self.write()

    def process_item(self, item):
        """Process every ``<body>`` found in a single spine item."""
        # Reuse the stylizer the rasterizer already built for this item, if any
        stylizer = self.svg_rasterizer.stylizer_cache.get(item)
        if stylizer is None:
            stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile)
        self.abshref = self.images_manager.abshref = item.abshref

        for i, body in enumerate(XPath('//h:body')(item.data)):
            with self.blocks:
                self.process_tag(body, stylizer, is_first_tag=i == 0)

    def process_tag(self, html_tag, stylizer, is_first_tag=False):
        """Recursively convert ``html_tag``, its children and its tail text.

        The contents of ``script``/``style``/``title``/``meta`` tags and of
        hidden tags are skipped, but the text that *follows* such a tag (its
        tail) belongs to the parent and is still emitted.

        :param html_tag: The element to convert.
        :param stylizer: Object whose ``style(tag)`` gives the resolved style.
        :param is_first_tag: When true the tail of ``html_tag`` is ignored.
        """
        tagname = barename(html_tag.tag)
        tag_style = stylizer.style(html_tag)
        ignore_tag_contents = tagname in {'script', 'style', 'title', 'meta'} or tag_style.is_hidden
        display = tag_style._get('display')
        is_block = False

        if not ignore_tag_contents:
            if display in {'inline', 'inline-block'} or tagname == 'br':
                # <br> has display:block but we dont want to start a new paragraph
                self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            elif display == 'list-item':
                # TODO: Implement this
                self.add_block_tag(tagname, html_tag, tag_style, stylizer)
            elif display.startswith('table') or display == 'inline-table':
                if display == 'table-cell':
                    self.blocks.start_new_cell(html_tag, tag_style)
                    self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_table_cell=True)
                elif display == 'table-row':
                    self.blocks.start_new_row(html_tag, tag_style)
                elif display in {'table', 'inline-table'}:
                    self.blocks.end_current_block()
                    self.blocks.start_new_table(html_tag, tag_style)
            elif tagname == 'img' and tag_style['float'] in {'left', 'right'}:
                # Image is floating so dont start a new paragraph for it
                self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            else:
                self.add_block_tag(tagname, html_tag, tag_style, stylizer)

            for child in html_tag.iterchildren('*'):
                self.process_tag(child, stylizer)

            # Must be sampled before finish_tag(), which is given the tag to close
            is_block = html_tag in self.blocks.open_html_blocks
            self.blocks.finish_tag(html_tag)
            if is_block and tag_style['page-break-after'] == 'avoid':
                self.blocks.all_blocks[-1].keep_next = True

        self._add_tail_text(html_tag, stylizer, is_first_tag, is_block, display)

    def _add_tail_text(self, html_tag, stylizer, is_first_tag, is_block, display):
        """Append the tail text of ``html_tag`` to a block of its parent."""
        tail = html_tag.tail
        if display == 'table-row' or is_first_tag or not tail:
            return  # We ignore the tail for these tags
        if (is_block or display.startswith('table')) and tail.isspace():
            # Ignore trailing space after a block tag, as otherwise it will
            # become a new empty paragraph
            return
        parent = html_tag.getparent()
        parent_style = stylizer.style(parent)
        block = self.blocks.current_or_new_block(parent, parent_style)
        block.add_text(tail, parent_style, is_parent_style=True)

    def add_block_tag(self, tagname, html_tag, tag_style, stylizer, is_table_cell=False):
        """Start a new block for ``html_tag`` and add its image or leading text."""
        block = self.blocks.start_new_block(html_tag, tag_style, is_table_cell=is_table_cell)
        if tagname == 'img':
            self.images_manager.add_image(html_tag, block, stylizer)
        elif html_tag.text:
            block.add_text(html_tag.text, tag_style, ignore_leading_whitespace=True, is_parent_style=True)

    def add_inline_tag(self, tagname, html_tag, tag_style, stylizer):
        """Add a break, image or text run for ``html_tag`` to the parent's block."""
        parent = html_tag.getparent()
        if tagname == 'br':
            # A <br> that is the last element child and has no tail is dropped
            if html_tag.tail or html_tag is not tuple(parent.iterchildren('*'))[-1]:
                block = self.blocks.current_or_new_block(parent, stylizer.style(parent))
                block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(tag_style['clear'], 'none'))
        elif tagname == 'img':
            block = self.blocks.current_or_new_block(parent, stylizer.style(parent))
            self.images_manager.add_image(html_tag, block, stylizer)
        elif html_tag.text:
            block = self.blocks.current_or_new_block(parent, stylizer.style(parent))
            block.add_text(html_tag.text, tag_style, is_parent_style=False)

    def write(self):
        """Serialize blocks, styles, images and fonts onto ``self.docx``."""
        self.docx.document, self.docx.styles, body = create_skeleton(self.opts)
        self.blocks.serialize(body)
        body.append(body[0])  # Move <sectPr> to the end
        self.styles_manager.serialize(self.docx.styles)
        self.images_manager.serialize(self.docx.images)
        self.fonts_manager.serialize(
            self.styles_manager.text_styles, self.docx.font_table,
            self.docx.embedded_fonts, self.docx.fonts)
class Convert(object):
    """Walk the spine of an OEB book and serialize the result into ``docx``.

    Calling the instance processes every spine item, turning HTML tags into
    entries of a ``Blocks`` collection, and then writes document, styles,
    images and fonts onto the ``docx`` container.
    """

    def __init__(self, oeb, docx):
        """Store the book and the output container.

        :param oeb: The book to convert; its ``spine`` is iterated later.
        :param docx: The output container; ``log`` and ``opts`` are read
            from it.
        """
        self.oeb, self.docx = oeb, docx
        self.log, self.opts = docx.log, docx.opts

    def __call__(self):
        """Run the whole conversion and write the output."""
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        self.svg_rasterizer = SVGRasterizer()
        self.svg_rasterizer(self.oeb, self.opts)

        self.styles_manager = StylesManager()
        self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships)
        self.fonts_manager = FontsManager(self.oeb, self.opts)
        self.blocks = Blocks(self.styles_manager)

        for item in self.oeb.spine:
            self.process_item(item)

        self.styles_manager.finalize(self.blocks.all_blocks)
        self.write()

    def process_item(self, item):
        """Process every ``<body>`` found in a single spine item."""
        # Reuse the stylizer the rasterizer already built for this item, if any
        stylizer = self.svg_rasterizer.stylizer_cache.get(item)
        if stylizer is None:
            stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile)
        self.abshref = self.images_manager.abshref = item.abshref

        for i, body in enumerate(XPath('//h:body')(item.data)):
            with self.blocks:
                self.process_tag(body, stylizer, is_first_tag=i == 0)

    def process_tag(self, html_tag, stylizer, is_first_tag=False):
        """Recursively convert ``html_tag``, its children and its tail text.

        The contents of ``script``/``style``/``title``/``meta`` tags and of
        hidden tags are skipped, but the text that *follows* such a tag (its
        tail) belongs to the parent and is still emitted.

        :param html_tag: The element to convert.
        :param stylizer: Object whose ``style(tag)`` gives the resolved style.
        :param is_first_tag: When true the tail of ``html_tag`` is ignored.
        """
        tagname = barename(html_tag.tag)
        tag_style = stylizer.style(html_tag)
        ignore_tag_contents = tagname in {'script', 'style', 'title', 'meta'} or tag_style.is_hidden
        display = tag_style._get('display')
        is_block = False

        if not ignore_tag_contents:
            if display in {'inline', 'inline-block'} or tagname == 'br':
                # <br> has display:block but we dont want to start a new paragraph
                self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            elif display == 'list-item':
                # TODO: Implement this
                self.add_block_tag(tagname, html_tag, tag_style, stylizer)
            elif display.startswith('table') or display == 'inline-table':
                if display == 'table-cell':
                    self.blocks.start_new_cell(html_tag, tag_style)
                    self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_table_cell=True)
                elif display == 'table-row':
                    self.blocks.start_new_row(html_tag, tag_style)
                elif display in {'table', 'inline-table'}:
                    self.blocks.start_new_table(html_tag, tag_style)
            elif tagname == 'img' and tag_style['float'] in {'left', 'right'}:
                # Image is floating so dont start a new paragraph for it
                self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            else:
                self.add_block_tag(tagname, html_tag, tag_style, stylizer)

            for child in html_tag.iterchildren('*'):
                self.process_tag(child, stylizer)

            # Must be sampled before finish_tag(), which is given the tag to close
            is_block = html_tag in self.blocks.open_html_blocks
            self.blocks.finish_tag(html_tag)
            if is_block and tag_style['page-break-after'] == 'avoid':
                self.blocks.all_blocks[-1].keep_next = True

        self._add_tail_text(html_tag, stylizer, is_first_tag, is_block, display)

    def _add_tail_text(self, html_tag, stylizer, is_first_tag, is_block, display):
        """Append the tail text of ``html_tag`` to a block of its parent."""
        tail = html_tag.tail
        if display == 'table-row' or is_first_tag or not tail:
            return  # We ignore the tail for these tags
        if (is_block or display.startswith('table')) and tail.isspace():
            # Ignore trailing space after a block or table-structure tag, as
            # otherwise it will become a new empty paragraph
            return
        parent = html_tag.getparent()
        parent_style = stylizer.style(parent)
        block = self.blocks.current_or_new_block(parent, parent_style)
        block.add_text(tail, parent_style, is_parent_style=True)

    def add_block_tag(self, tagname, html_tag, tag_style, stylizer, is_table_cell=False):
        """Start a new block for ``html_tag`` and add its image or leading text."""
        block = self.blocks.start_new_block(html_tag, tag_style, is_table_cell=is_table_cell)
        if tagname == 'img':
            self.images_manager.add_image(html_tag, block, stylizer)
        elif html_tag.text:
            block.add_text(html_tag.text, tag_style, ignore_leading_whitespace=True, is_parent_style=True)

    def add_inline_tag(self, tagname, html_tag, tag_style, stylizer):
        """Add a break, image or text run for ``html_tag`` to the parent's block."""
        parent = html_tag.getparent()
        if tagname == 'br':
            # A <br> that is the last element child and has no tail is dropped
            if html_tag.tail or html_tag is not tuple(parent.iterchildren('*'))[-1]:
                block = self.blocks.current_or_new_block(parent, stylizer.style(parent))
                block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(tag_style['clear'], 'none'))
        elif tagname == 'img':
            block = self.blocks.current_or_new_block(parent, stylizer.style(parent))
            self.images_manager.add_image(html_tag, block, stylizer)
        elif html_tag.text:
            block = self.blocks.current_or_new_block(parent, stylizer.style(parent))
            block.add_text(html_tag.text, tag_style, is_parent_style=False)

    def write(self):
        """Serialize blocks, styles, images and fonts onto ``self.docx``."""
        self.docx.document, self.docx.styles, body = create_skeleton(self.opts)
        self.blocks.serialize(body)
        body.append(body[0])  # Move <sectPr> to the end
        self.styles_manager.serialize(self.docx.styles)
        self.images_manager.serialize(self.docx.images)
        self.fonts_manager.serialize(
            self.styles_manager.text_styles, self.docx.font_table,
            self.docx.embedded_fonts, self.docx.fonts)
class Convert:
    """Walk the spine of an OEB book and serialize the result into ``docx``.

    Calling the instance processes every spine item, turning HTML tags into
    entries of a ``Blocks`` collection, optionally collects TOC links and a
    cover image, and then writes everything onto the ``docx`` container.
    """

    # Word does not apply default styling to hyperlinks, so we ensure they get
    # default styling (the conversion pipeline does not apply any styling to
    # them).
    base_css = ''' a[href] { text-decoration: underline; color: blue } '''

    def __init__(self, oeb, docx, mi, add_cover, add_toc):
        """Store the inputs and set the page area on the output profile.

        :param oeb: The book to convert.
        :param docx: The output container; ``log`` and ``opts`` come from it.
        :param mi: Metadata object; ``mi.language`` is read when converting.
        :param add_cover: Whether to look up and emit a cover image.
        :param add_toc: Whether to process and emit TOC links.
        """
        self.oeb, self.docx, self.add_cover, self.add_toc = oeb, docx, add_cover, add_toc
        self.log, self.opts = docx.log, docx.opts
        self.mi = mi
        self.cover_img = None
        profile = self.opts.output_profile
        profile.width_pts, profile.height_pts = page_effective_area(self.opts)

    def __call__(self):
        """Run the whole conversion and write the output."""
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        self.svg_rasterizer = SVGRasterizer(base_css=self.base_css)
        self.svg_rasterizer(self.oeb, self.opts)

        namespace = self.docx.namespace
        self.styles_manager = StylesManager(namespace, self.log, self.mi.language)
        self.links_manager = LinksManager(namespace, self.docx.document_relationships, self.log)
        self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships, self.opts)
        self.lists_manager = ListsManager(self.docx)
        self.fonts_manager = FontsManager(namespace, self.oeb, self.opts)
        self.blocks = Blocks(namespace, self.styles_manager, self.links_manager)
        self.current_link = self.current_lang = None

        for item in self.oeb.spine:
            self.log.debug('Processing', item.href)
            self.process_item(item)
        if self.add_toc:
            self.links_manager.process_toc_links(self.oeb)

        cover = self.oeb.metadata.cover if self.add_cover else None
        if cover:
            cover_id = str(cover[0])
            if cover_id in self.oeb.manifest.ids:
                item = self.oeb.manifest.ids[cover_id]
                self.cover_img = self.images_manager.read_image(item.href)

        all_blocks = self.blocks.all_blocks
        self._prune_skipped_blocks(all_blocks)
        self.blocks.apply_page_break_after()
        self.blocks.resolve_language()

        if self.cover_img is not None:
            self.cover_img = self.images_manager.create_cover_markup(
                self.cover_img, self.opts.preserve_cover_aspect_ratio,
                *page_size(self.opts))
        self.lists_manager.finalize(all_blocks)
        self.styles_manager.finalize(all_blocks)
        self.write()

    def _prune_skipped_blocks(self, all_blocks):
        """Delete blocks flagged as skipped and mark the first remaining one.

        Each block except the last is resolved against its successor; the
        last block is never examined. Nothing is marked when no blocks
        exist, which avoids an IndexError on an empty document.
        """
        skipped_positions = []
        for pos, block in enumerate(all_blocks[:-1]):
            block.resolve_skipped(all_blocks[pos + 1])
            if block.skipped:
                skipped_positions.append(pos)
        # Delete from the end so earlier positions stay valid
        for pos in reversed(skipped_positions):
            self.blocks.delete_block_at(pos)
        if self.blocks.all_blocks:
            self.blocks.all_blocks[0].is_first_block = True

    def process_item(self, item):
        """Process every ``<body>`` found in a single spine item."""
        self.current_item = item
        # Reuse the stylizer the rasterizer already built for this item, if any
        stylizer = self.svg_rasterizer.stylizer_cache.get(item)
        if stylizer is None:
            stylizer = Stylizer(item.data, item.href, self.oeb, self.opts,
                                profile=self.opts.output_profile, base_css=self.base_css)
        self.abshref = self.images_manager.abshref = item.abshref
        self.current_lang = lang_for_tag(item.data) or self.styles_manager.document_lang

        for i, body in enumerate(XPath('//h:body')(item.data)):
            with self.blocks:
                self.blocks.top_bookmark = self.links_manager.bookmark_for_anchor(
                    self.links_manager.top_anchor, self.current_item, body)
                self.process_tag(body, stylizer, is_first_tag=i == 0)

    def process_tag(self, html_tag, stylizer, is_first_tag=False, float_spec=None):
        """Recursively convert ``html_tag``, its children and its tail text.

        The contents of ``script``/``style``/``title``/``meta`` tags and of
        hidden tags are skipped; their tail text is still emitted.

        :param html_tag: The element to convert.
        :param stylizer: Object whose ``style(tag)`` gives the resolved style.
        :param is_first_tag: When true the tag is never treated as floating
            and its tail is ignored.
        :param float_spec: Float specification inherited from an ancestor,
            or ``None``.
        """
        tagname = barename(html_tag.tag)
        tag_style = stylizer.style(html_tag)
        ignore_tag_contents = tagname in {'script', 'style', 'title', 'meta'} or tag_style.is_hidden
        display = tag_style._get('display')
        is_block = False

        if not ignore_tag_contents:
            # Link and language apply to this subtree only; restored below
            previous_link = self.current_link
            if tagname == 'a' and html_tag.get('href'):
                self.current_link = (self.current_item, html_tag.get('href'), html_tag.get('title'))
            previous_lang = self.current_lang
            tag_lang = lang_for_tag(html_tag)
            if tag_lang:
                self.current_lang = tag_lang

            is_float = tag_style['float'] in {'left', 'right'} and not is_first_tag
            if float_spec is None and is_float:
                float_spec = FloatSpec(self.docx.namespace, html_tag, tag_style)

            if display in {'inline', 'inline-block'} or tagname == 'br':
                # <br> has display:block but we dont want to start a new paragraph
                if is_float and float_spec.is_dropcaps:
                    self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)
                    float_spec = None
                else:
                    self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            elif display == 'list-item':
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_list_item=True)
            elif display.startswith('table') or display == 'inline-table':
                if display == 'table-cell':
                    self.blocks.start_new_cell(html_tag, tag_style)
                    self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_table_cell=True)
                elif display == 'table-row':
                    self.blocks.start_new_row(html_tag, tag_style)
                elif display in {'table', 'inline-table'}:
                    self.blocks.end_current_block()
                    self.blocks.start_new_table(html_tag, tag_style)
            elif tagname == 'img' and is_float:
                # Image is floating so dont start a new paragraph for it
                self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            else:
                if tagname == 'hr':
                    for edge in 'right bottom left'.split():
                        tag_style.set('border-%s-style' % edge, 'none')
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)

            for child in html_tag.iterchildren():
                if isinstance(getattr(child, 'tag', None), string_or_bytes):
                    self.process_tag(child, stylizer, float_spec=float_spec)
                else:  # Comment/PI/etc.
                    tail = getattr(child, 'tail', None)
                    if tail:
                        block = self.create_block_from_parent(html_tag, stylizer)
                        block.add_text(tail, tag_style, is_parent_style=False,
                                       link=self.current_link, lang=self.current_lang)

            # Must be sampled before finish_tag(), which is given the tag to close
            is_block = html_tag in self.blocks.open_html_blocks
            self.blocks.finish_tag(html_tag)
            if is_block and tag_style['page-break-after'] == 'avoid':
                self.blocks.all_blocks[-1].keep_next = True

            self.current_link = previous_link
            self.current_lang = previous_lang

        # Now, process the tail if any
        if display == 'table-row':
            return  # We ignore the tail for these tags

        ignore_whitespace_tail = is_block or display.startswith('table')
        tail = html_tag.tail
        if not is_first_tag and tail and (not ignore_whitespace_tail or not tail.isspace()):
            # Ignore trailing space after a block tag, as otherwise it will
            # become a new empty paragraph
            block = self.create_block_from_parent(html_tag, stylizer)
            block.add_text(tail, stylizer.style(html_tag.getparent()), is_parent_style=True,
                           link=self.current_link, lang=self.current_lang)

    def create_block_from_parent(self, html_tag, stylizer):
        """Return the current (or a new) block for the parent of ``html_tag``."""
        parent = html_tag.getparent()
        block = self.blocks.current_or_new_block(parent, stylizer.style(parent))
        # Do not inherit page-break-before from parent
        block.page_break_before = False
        return block

    def add_block_tag(self, tagname, html_tag, tag_style, stylizer,
                      is_table_cell=False, float_spec=None, is_list_item=False):
        """Start a new block for ``html_tag``; add its bookmark, image or text."""
        block = self.blocks.start_new_block(
            html_tag, tag_style, is_table_cell=is_table_cell,
            float_spec=float_spec, is_list_item=is_list_item)
        anchor = html_tag.get('id') or html_tag.get('name')
        if anchor:
            block.bookmarks.add(self.bookmark_for_anchor(anchor, html_tag))
        if tagname == 'img':
            self.images_manager.add_image(html_tag, block, stylizer, as_block=True)
            return

        text = html_tag.text
        is_list_item = tagname == 'li'
        # True when an <li> starts with a non-empty <ul>/<ol> child
        has_sublist = is_list_item and len(html_tag) and barename(
            html_tag[0].tag) in ('ul', 'ol') and len(html_tag[0])
        if text and has_sublist and not text.strip():
            text = ''  # whitespace only, ignore
        if text:
            block.add_text(text, tag_style, ignore_leading_whitespace=True, is_parent_style=True,
                           link=self.current_link, lang=self.current_lang)
        elif has_sublist:
            block.force_not_empty = True

    def add_inline_tag(self, tagname, html_tag, tag_style, stylizer):
        """Add a break, image or text run for ``html_tag`` to the parent's block."""
        anchor = html_tag.get('id') or html_tag.get('name') or None
        bmark = self.bookmark_for_anchor(anchor, html_tag) if anchor else None

        if tagname == 'br':
            # A <br> that is the last element child and has no tail is dropped
            if html_tag.tail or html_tag is not tuple(html_tag.getparent().iterchildren('*'))[-1]:
                block = self.create_block_from_parent(html_tag, stylizer)
                block.add_break(clear={
                    'both': 'all', 'left': 'left', 'right': 'right'
                }.get(tag_style['clear'], 'none'), bookmark=bmark)
        elif tagname == 'img':
            block = self.create_block_from_parent(html_tag, stylizer)
            self.images_manager.add_image(html_tag, block, stylizer, bookmark=bmark)
        elif html_tag.text:
            block = self.create_block_from_parent(html_tag, stylizer)
            block.add_text(html_tag.text, tag_style, is_parent_style=False, bookmark=bmark,
                           link=self.current_link, lang=self.current_lang)
        elif bmark:
            # No text, but the anchor still needs a run to carry its bookmark
            block = self.create_block_from_parent(html_tag, stylizer)
            block.add_text('', tag_style, is_parent_style=False, bookmark=bmark,
                           link=self.current_link, lang=self.current_lang)

    def bookmark_for_anchor(self, anchor, html_tag):
        """Ask the links manager for the bookmark of ``anchor`` in the current item."""
        return self.links_manager.bookmark_for_anchor(anchor, self.current_item, html_tag)

    def write(self):
        """Serialize blocks, TOC, cover, styles, images, fonts and lists onto ``self.docx``."""
        self.docx.document, self.docx.styles, body = create_skeleton(self.opts)
        self.blocks.serialize(body)
        body.append(body[0])  # Move <sectPr> to the end
        if self.links_manager.toc:
            self.links_manager.serialize_toc(body, self.styles_manager.primary_heading_style)
        if self.cover_img is not None:
            self.images_manager.write_cover_block(body, self.cover_img)
        self.styles_manager.serialize(self.docx.styles)
        self.images_manager.serialize(self.docx.images)
        self.fonts_manager.serialize(
            self.styles_manager.text_styles, self.docx.font_table,
            self.docx.embedded_fonts, self.docx.fonts)
        self.lists_manager.serialize(self.docx.numbering)
class Convert(object):
    """Walk the spine of an OEB book and serialize the result into ``docx``.

    Calling the instance processes every spine item, turning HTML tags into
    entries of a ``Blocks`` collection, and then writes document, styles,
    images, fonts and lists onto the ``docx`` container.
    """

    # Word does not apply default styling to hyperlinks, so we ensure they get
    # default styling (the conversion pipeline does not apply any styling to
    # them).
    base_css = ''' a[href] { text-decoration: underline; color: blue } '''

    def __init__(self, oeb, docx):
        """Store the book and the output container.

        :param oeb: The book to convert; its ``spine`` is iterated later.
        :param docx: The output container; ``log`` and ``opts`` are read
            from it.
        """
        self.oeb, self.docx = oeb, docx
        self.log, self.opts = docx.log, docx.opts

    def __call__(self):
        """Run the whole conversion and write the output."""
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        self.svg_rasterizer = SVGRasterizer(base_css=self.base_css)
        self.svg_rasterizer(self.oeb, self.opts)

        namespace = self.docx.namespace
        self.styles_manager = StylesManager(namespace)
        self.links_manager = LinksManager(namespace, self.docx.document_relationships)
        self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships)
        self.lists_manager = ListsManager(self.docx)
        self.fonts_manager = FontsManager(namespace, self.oeb, self.opts)
        self.blocks = Blocks(namespace, self.styles_manager, self.links_manager)
        self.current_link = None

        for item in self.oeb.spine:
            self.process_item(item)

        all_blocks = self.blocks.all_blocks
        self._prune_skipped_blocks(all_blocks)

        self.lists_manager.finalize(all_blocks)
        self.styles_manager.finalize(all_blocks)
        self.write()

    def _prune_skipped_blocks(self, all_blocks):
        """Delete blocks flagged as skipped and mark the first remaining one.

        Each block except the last is resolved against its successor; the
        last block is never examined. Nothing is marked when no blocks
        exist, which avoids an IndexError on an empty document.
        """
        skipped_positions = []
        for pos, block in enumerate(all_blocks[:-1]):
            block.resolve_skipped(all_blocks[pos + 1])
            if block.skipped:
                skipped_positions.append(pos)
        # Delete from the end so earlier positions stay valid
        for pos in reversed(skipped_positions):
            self.blocks.delete_block_at(pos)
        if self.blocks.all_blocks:
            self.blocks.all_blocks[0].is_first_block = True

    def process_item(self, item):
        """Process every ``<body>`` found in a single spine item."""
        self.current_item = item
        # Reuse the stylizer the rasterizer already built for this item, if any
        stylizer = self.svg_rasterizer.stylizer_cache.get(item)
        if stylizer is None:
            stylizer = Stylizer(item.data, item.href, self.oeb, self.opts,
                                self.opts.output_profile, base_css=self.base_css)
        self.abshref = self.images_manager.abshref = item.abshref

        for i, body in enumerate(XPath('//h:body')(item.data)):
            with self.blocks:
                # Give a body without an id the links manager's top anchor
                body.set('id', body.get('id', None) or self.links_manager.top_anchor)
                self.process_tag(body, stylizer, is_first_tag=i == 0)

    def process_tag(self, html_tag, stylizer, is_first_tag=False, float_spec=None):
        """Recursively convert ``html_tag``, its children and its tail text.

        The contents of ``script``/``style``/``title``/``meta`` tags and of
        hidden tags are skipped, but the text that *follows* such a tag (its
        tail) belongs to the parent and is still emitted.

        :param html_tag: The element to convert.
        :param stylizer: Object whose ``style(tag)`` gives the resolved style.
        :param is_first_tag: When true the tag is never treated as floating
            and its tail is ignored.
        :param float_spec: Float specification inherited from an ancestor,
            or ``None``.
        """
        tagname = barename(html_tag.tag)
        tag_style = stylizer.style(html_tag)
        ignore_tag_contents = tagname in {'script', 'style', 'title', 'meta'} or tag_style.is_hidden
        display = tag_style._get('display')
        is_block = False

        if not ignore_tag_contents:
            # The link applies to this subtree only; restored below
            previous_link = self.current_link
            if tagname == 'a' and html_tag.get('href'):
                self.current_link = (self.current_item, html_tag.get('href'), html_tag.get('title'))

            is_float = tag_style['float'] in {'left', 'right'} and not is_first_tag
            if float_spec is None and is_float:
                float_spec = FloatSpec(self.docx.namespace, html_tag, tag_style)

            if display in {'inline', 'inline-block'} or tagname == 'br':
                # <br> has display:block but we dont want to start a new paragraph
                if is_float and float_spec.is_dropcaps:
                    self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)
                    float_spec = None
                else:
                    self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            elif display == 'list-item':
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_list_item=True)
            elif display.startswith('table') or display == 'inline-table':
                if display == 'table-cell':
                    self.blocks.start_new_cell(html_tag, tag_style)
                    self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_table_cell=True)
                elif display == 'table-row':
                    self.blocks.start_new_row(html_tag, tag_style)
                elif display in {'table', 'inline-table'}:
                    self.blocks.end_current_block()
                    self.blocks.start_new_table(html_tag, tag_style)
            elif tagname == 'img' and is_float:
                # Image is floating so dont start a new paragraph for it
                self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            else:
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)

            for child in html_tag.iterchildren('*'):
                self.process_tag(child, stylizer, float_spec=float_spec)

            # Must be sampled before finish_tag(), which is given the tag to close
            is_block = html_tag in self.blocks.open_html_blocks
            self.blocks.finish_tag(html_tag)
            if is_block and tag_style['page-break-after'] == 'avoid':
                self.blocks.all_blocks[-1].keep_next = True
            self.current_link = previous_link

        self._add_tail_text(html_tag, stylizer, is_first_tag, is_block, display)

    def _add_tail_text(self, html_tag, stylizer, is_first_tag, is_block, display):
        """Append the tail text of ``html_tag`` to a block of its parent."""
        tail = html_tag.tail
        if display == 'table-row' or is_first_tag or not tail:
            return  # We ignore the tail for these tags
        if (is_block or display.startswith('table')) and tail.isspace():
            # Ignore trailing space after a block tag, as otherwise it will
            # become a new empty paragraph
            return
        parent = html_tag.getparent()
        parent_style = stylizer.style(parent)
        block = self.blocks.current_or_new_block(parent, parent_style)
        block.add_text(tail, parent_style, is_parent_style=True, link=self.current_link)

    def add_block_tag(self, tagname, html_tag, tag_style, stylizer,
                      is_table_cell=False, float_spec=None, is_list_item=False):
        """Start a new block for ``html_tag``; add its bookmark, image or text."""
        block = self.blocks.start_new_block(
            html_tag, tag_style, is_table_cell=is_table_cell,
            float_spec=float_spec, is_list_item=is_list_item)
        anchor = html_tag.get('id') or html_tag.get('name')
        if anchor:
            block.bookmarks.add(self.bookmark_for_anchor(anchor, html_tag))
        if tagname == 'img':
            self.images_manager.add_image(html_tag, block, stylizer, as_block=True)
        elif html_tag.text:
            block.add_text(html_tag.text, tag_style, ignore_leading_whitespace=True,
                           is_parent_style=True, link=self.current_link)

    def add_inline_tag(self, tagname, html_tag, tag_style, stylizer):
        """Add a break, image or text run for ``html_tag`` to the parent's block."""
        anchor = html_tag.get('id') or html_tag.get('name') or None
        bmark = self.bookmark_for_anchor(anchor, html_tag) if anchor else None
        parent = html_tag.getparent()

        if tagname == 'br':
            # A <br> that is the last element child and has no tail is dropped
            if html_tag.tail or html_tag is not tuple(parent.iterchildren('*'))[-1]:
                block = self.blocks.current_or_new_block(parent, stylizer.style(parent))
                block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(tag_style['clear'], 'none'), bookmark=bmark)
        elif tagname == 'img':
            block = self.blocks.current_or_new_block(parent, stylizer.style(parent))
            self.images_manager.add_image(html_tag, block, stylizer, bookmark=bmark)
        elif html_tag.text:
            block = self.blocks.current_or_new_block(parent, stylizer.style(parent))
            block.add_text(html_tag.text, tag_style, is_parent_style=False,
                           bookmark=bmark, link=self.current_link)

    def bookmark_for_anchor(self, anchor, html_tag):
        """Ask the links manager for the bookmark of ``anchor`` in the current item."""
        return self.links_manager.bookmark_for_anchor(anchor, self.current_item, html_tag)

    def write(self):
        """Serialize blocks, styles, images, fonts and lists onto ``self.docx``."""
        self.docx.document, self.docx.styles, body = create_skeleton(self.opts)
        self.blocks.serialize(body)
        body.append(body[0])  # Move <sectPr> to the end
        self.styles_manager.serialize(self.docx.styles)
        self.images_manager.serialize(self.docx.images)
        self.fonts_manager.serialize(
            self.styles_manager.text_styles, self.docx.font_table,
            self.docx.embedded_fonts, self.docx.fonts)
        self.lists_manager.serialize(self.docx.numbering)
class Convert(object):
    """Walk the spine of an OEB book and serialize the result into ``docx``.

    Calling the instance processes every spine item, turning HTML tags into
    entries of a ``Blocks`` collection, optionally collects TOC links and a
    cover image, and then writes everything onto the ``docx`` container.
    """

    # Word does not apply default styling to hyperlinks, so we ensure they get
    # default styling (the conversion pipeline does not apply any styling to
    # them).
    base_css = ''' a[href] { text-decoration: underline; color: blue } '''

    def __init__(self, oeb, docx, mi, add_cover, add_toc):
        """Store the inputs and set the page area on the output profile.

        :param oeb: The book to convert.
        :param docx: The output container; ``log`` and ``opts`` come from it.
        :param mi: Metadata object; ``mi.language`` is read when converting.
        :param add_cover: Whether to look up and emit a cover image.
        :param add_toc: Whether to process and emit TOC links.
        """
        self.oeb, self.docx, self.add_cover, self.add_toc = oeb, docx, add_cover, add_toc
        self.log, self.opts = docx.log, docx.opts
        self.mi = mi
        self.cover_img = None
        profile = self.opts.output_profile
        profile.width_pts, profile.height_pts = page_effective_area(self.opts)

    def __call__(self):
        """Run the whole conversion and write the output."""
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        self.svg_rasterizer = SVGRasterizer(base_css=self.base_css)
        self.svg_rasterizer(self.oeb, self.opts)

        namespace = self.docx.namespace
        self.styles_manager = StylesManager(namespace, self.log, self.mi.language)
        self.links_manager = LinksManager(namespace, self.docx.document_relationships, self.log)
        self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships, self.opts)
        self.lists_manager = ListsManager(self.docx)
        self.fonts_manager = FontsManager(namespace, self.oeb, self.opts)
        self.blocks = Blocks(namespace, self.styles_manager, self.links_manager)
        self.current_link = self.current_lang = None

        for item in self.oeb.spine:
            self.log.debug('Processing', item.href)
            self.process_item(item)
        if self.add_toc:
            self.links_manager.process_toc_links(self.oeb)

        cover = self.oeb.metadata.cover if self.add_cover else None
        if cover:
            # NOTE(review): ``unicode`` is the Python 2 builtin this snapshot
            # was written against; confirm it is in scope for the target runtime
            cover_id = unicode(cover[0])
            if cover_id in self.oeb.manifest.ids:
                item = self.oeb.manifest.ids[cover_id]
                self.cover_img = self.images_manager.read_image(item.href)

        all_blocks = self.blocks.all_blocks
        self._prune_skipped_blocks(all_blocks)
        self.blocks.apply_page_break_after()
        self.blocks.resolve_language()

        if self.cover_img is not None:
            self.cover_img = self.images_manager.create_cover_markup(
                self.cover_img, self.opts.preserve_cover_aspect_ratio,
                *page_size(self.opts))
        self.lists_manager.finalize(all_blocks)
        self.styles_manager.finalize(all_blocks)
        self.write()

    def _prune_skipped_blocks(self, all_blocks):
        """Delete blocks flagged as skipped and mark the first remaining one.

        Each block except the last is resolved against its successor; the
        last block is never examined. Nothing is marked when no blocks
        exist, which avoids an IndexError on an empty document.
        """
        skipped_positions = []
        for pos, block in enumerate(all_blocks[:-1]):
            block.resolve_skipped(all_blocks[pos + 1])
            if block.skipped:
                skipped_positions.append(pos)
        # Delete from the end so earlier positions stay valid
        for pos in reversed(skipped_positions):
            self.blocks.delete_block_at(pos)
        if self.blocks.all_blocks:
            self.blocks.all_blocks[0].is_first_block = True

    def process_item(self, item):
        """Process every ``<body>`` found in a single spine item."""
        self.current_item = item
        # Reuse the stylizer the rasterizer already built for this item, if any
        stylizer = self.svg_rasterizer.stylizer_cache.get(item)
        if stylizer is None:
            stylizer = Stylizer(item.data, item.href, self.oeb, self.opts,
                                profile=self.opts.output_profile, base_css=self.base_css)
        self.abshref = self.images_manager.abshref = item.abshref
        self.current_lang = lang_for_tag(item.data) or self.styles_manager.document_lang

        for i, body in enumerate(XPath('//h:body')(item.data)):
            with self.blocks:
                self.blocks.top_bookmark = self.links_manager.bookmark_for_anchor(
                    self.links_manager.top_anchor, self.current_item, body)
                self.process_tag(body, stylizer, is_first_tag=i == 0)

    def process_tag(self, html_tag, stylizer, is_first_tag=False, float_spec=None):
        """Recursively convert ``html_tag``, its children and its tail text.

        The contents of ``script``/``style``/``title``/``meta`` tags and of
        hidden tags are skipped; their tail text is still emitted.

        :param html_tag: The element to convert.
        :param stylizer: Object whose ``style(tag)`` gives the resolved style.
        :param is_first_tag: When true the tag is never treated as floating
            and its tail is ignored.
        :param float_spec: Float specification inherited from an ancestor,
            or ``None``.
        """
        tagname = barename(html_tag.tag)
        tag_style = stylizer.style(html_tag)
        ignore_tag_contents = tagname in {'script', 'style', 'title', 'meta'} or tag_style.is_hidden
        display = tag_style._get('display')
        is_block = False

        if not ignore_tag_contents:
            # Link and language apply to this subtree only; restored below
            previous_link = self.current_link
            if tagname == 'a' and html_tag.get('href'):
                self.current_link = (self.current_item, html_tag.get('href'), html_tag.get('title'))
            previous_lang = self.current_lang
            tag_lang = lang_for_tag(html_tag)
            if tag_lang:
                self.current_lang = tag_lang

            is_float = tag_style['float'] in {'left', 'right'} and not is_first_tag
            if float_spec is None and is_float:
                float_spec = FloatSpec(self.docx.namespace, html_tag, tag_style)

            if display in {'inline', 'inline-block'} or tagname == 'br':
                # <br> has display:block but we dont want to start a new paragraph
                if is_float and float_spec.is_dropcaps:
                    self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)
                    float_spec = None
                else:
                    self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            elif display == 'list-item':
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_list_item=True)
            elif display.startswith('table') or display == 'inline-table':
                if display == 'table-cell':
                    self.blocks.start_new_cell(html_tag, tag_style)
                    self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_table_cell=True)
                elif display == 'table-row':
                    self.blocks.start_new_row(html_tag, tag_style)
                elif display in {'table', 'inline-table'}:
                    self.blocks.end_current_block()
                    self.blocks.start_new_table(html_tag, tag_style)
            elif tagname == 'img' and is_float:
                # Image is floating so dont start a new paragraph for it
                self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            else:
                if tagname == 'hr':
                    for edge in 'right bottom left'.split():
                        tag_style.set('border-%s-style' % edge, 'none')
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)

            for child in html_tag.iterchildren():
                if isinstance(getattr(child, 'tag', None), basestring):
                    self.process_tag(child, stylizer, float_spec=float_spec)
                else:  # Comment/PI/etc.
                    tail = getattr(child, 'tail', None)
                    if tail:
                        block = self.create_block_from_parent(html_tag, stylizer)
                        block.add_text(tail, tag_style, is_parent_style=False,
                                       link=self.current_link, lang=self.current_lang)

            # Must be sampled before finish_tag(), which is given the tag to close
            is_block = html_tag in self.blocks.open_html_blocks
            self.blocks.finish_tag(html_tag)
            if is_block and tag_style['page-break-after'] == 'avoid':
                self.blocks.all_blocks[-1].keep_next = True

            self.current_link = previous_link
            self.current_lang = previous_lang

        # Now, process the tail if any
        if display == 'table-row':
            return  # We ignore the tail for these tags

        ignore_whitespace_tail = is_block or display.startswith('table')
        tail = html_tag.tail
        if not is_first_tag and tail and (not ignore_whitespace_tail or not tail.isspace()):
            # Ignore trailing space after a block tag, as otherwise it will
            # become a new empty paragraph
            block = self.create_block_from_parent(html_tag, stylizer)
            block.add_text(tail, stylizer.style(html_tag.getparent()), is_parent_style=True,
                           link=self.current_link, lang=self.current_lang)

    def create_block_from_parent(self, html_tag, stylizer):
        """Return the current (or a new) block for the parent of ``html_tag``."""
        parent = html_tag.getparent()
        block = self.blocks.current_or_new_block(parent, stylizer.style(parent))
        # Do not inherit page-break-before from parent
        block.page_break_before = False
        return block

    def add_block_tag(self, tagname, html_tag, tag_style, stylizer,
                      is_table_cell=False, float_spec=None, is_list_item=False):
        """Start a new block for ``html_tag``; add its bookmark, image or text."""
        block = self.blocks.start_new_block(
            html_tag, tag_style, is_table_cell=is_table_cell,
            float_spec=float_spec, is_list_item=is_list_item)
        anchor = html_tag.get('id') or html_tag.get('name')
        if anchor:
            block.bookmarks.add(self.bookmark_for_anchor(anchor, html_tag))
        if tagname == 'img':
            self.images_manager.add_image(html_tag, block, stylizer, as_block=True)
        elif html_tag.text:
            block.add_text(html_tag.text, tag_style, ignore_leading_whitespace=True,
                           is_parent_style=True, link=self.current_link, lang=self.current_lang)

    def add_inline_tag(self, tagname, html_tag, tag_style, stylizer):
        """Add a break, image or text run for ``html_tag`` to the parent's block."""
        anchor = html_tag.get('id') or html_tag.get('name') or None
        bmark = self.bookmark_for_anchor(anchor, html_tag) if anchor else None

        if tagname == 'br':
            # A <br> that is the last element child and has no tail is dropped
            if html_tag.tail or html_tag is not tuple(html_tag.getparent().iterchildren('*'))[-1]:
                block = self.create_block_from_parent(html_tag, stylizer)
                block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(tag_style['clear'], 'none'), bookmark=bmark)
        elif tagname == 'img':
            block = self.create_block_from_parent(html_tag, stylizer)
            self.images_manager.add_image(html_tag, block, stylizer, bookmark=bmark)
        elif html_tag.text:
            block = self.create_block_from_parent(html_tag, stylizer)
            block.add_text(html_tag.text, tag_style, is_parent_style=False, bookmark=bmark,
                           link=self.current_link, lang=self.current_lang)
        elif bmark:
            # No text, but the anchor still needs a run to carry its bookmark
            block = self.create_block_from_parent(html_tag, stylizer)
            block.add_text('', tag_style, is_parent_style=False, bookmark=bmark,
                           link=self.current_link, lang=self.current_lang)

    def bookmark_for_anchor(self, anchor, html_tag):
        """Ask the links manager for the bookmark of ``anchor`` in the current item."""
        return self.links_manager.bookmark_for_anchor(anchor, self.current_item, html_tag)

    def write(self):
        """Serialize blocks, TOC, cover, styles, images, fonts and lists onto ``self.docx``."""
        self.docx.document, self.docx.styles, body = create_skeleton(self.opts)
        self.blocks.serialize(body)
        body.append(body[0])  # Move <sectPr> to the end
        if self.links_manager.toc:
            self.links_manager.serialize_toc(body, self.styles_manager.primary_heading_style)
        if self.cover_img is not None:
            self.images_manager.write_cover_block(body, self.cover_img)
        self.styles_manager.serialize(self.docx.styles)
        self.images_manager.serialize(self.docx.images)
        self.fonts_manager.serialize(
            self.styles_manager.text_styles, self.docx.font_table,
            self.docx.embedded_fonts, self.docx.fonts)
        self.lists_manager.serialize(self.docx.numbering)
class Convert(object):
    """Turn the spine of an OEB book into the parts of a DOCX container.

    Calling an instance runs the whole conversion: every spine item is walked
    tag by tag, the resulting blocks are post-processed, and the document,
    styles, images, fonts and numbering are serialized onto ``docx``.
    """

    # Word does not apply default styling to hyperlinks, so we ensure they get
    # default styling (the conversion pipeline does not apply any styling to
    # them).
    base_css = '''
    a[href] { text-decoration: underline; color: blue }
    '''

    # Tags whose own content never contributes anything to the document.
    _content_free_tags = frozenset(('script', 'style', 'title', 'meta'))

    def __init__(self, oeb, docx):
        """Remember the source book ``oeb`` and the target container ``docx``."""
        self.oeb, self.docx = oeb, docx
        self.log, self.opts = docx.log, docx.opts

    def __call__(self):
        """Run the conversion and write the result into ``self.docx``."""
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        self.svg_rasterizer = SVGRasterizer(base_css=self.base_css)
        self.svg_rasterizer(self.oeb, self.opts)

        self.styles_manager = StylesManager(self.docx.namespace)
        self.links_manager = LinksManager(self.docx.namespace, self.docx.document_relationships)
        self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships)
        self.lists_manager = ListsManager(self.docx)
        self.fonts_manager = FontsManager(self.docx.namespace, self.oeb, self.opts)
        self.blocks = Blocks(self.docx.namespace, self.styles_manager, self.links_manager)
        self.current_link = None

        for item in self.oeb.spine:
            self.process_item(item)

        # Let every block decide, given its successor, whether it is skipped,
        # then delete the skipped ones back to front so that the recorded
        # positions stay valid.
        all_blocks = self.blocks.all_blocks
        skipped_positions = []
        for pos, block in enumerate(all_blocks):
            try:
                following = all_blocks[pos + 1]
            except IndexError:
                break  # the last block has no successor
            block.resolve_skipped(following)
            if block.skipped:
                skipped_positions.append(pos)
        for pos in reversed(skipped_positions):
            self.blocks.delete_block_at(pos)

        # An empty spine (or one with only hidden content) produces no blocks
        # at all; do not fail with an IndexError in that case.
        if self.blocks.all_blocks:
            self.blocks.all_blocks[0].is_first_block = True
        self.lists_manager.finalize(all_blocks)
        self.styles_manager.finalize(all_blocks)
        self.write()

    def process_item(self, item):
        """Convert every ``<body>`` of one spine ``item`` into blocks."""
        self.current_item = item
        stylizer = self.svg_rasterizer.stylizer_cache.get(item)
        if stylizer is None:
            stylizer = Stylizer(
                item.data, item.href, self.oeb, self.opts, self.opts.output_profile,
                base_css=self.base_css)
        self.abshref = self.images_manager.abshref = item.abshref

        for i, body in enumerate(XPath('//h:body')(item.data)):
            with self.blocks:
                # Make sure the body always has an id (the links manager's
                # top anchor is used when it has none of its own).
                body.set('id', body.get('id', None) or self.links_manager.top_anchor)
                self.process_tag(body, stylizer, is_first_tag=i == 0)

    def process_tag(self, html_tag, stylizer, is_first_tag=False, float_spec=None):
        """Convert ``html_tag``, its descendants and its tail text.

        The contents of hidden elements and of script/style/title/meta tags
        are skipped. Their tail text belongs to the parent element, so it is
        still emitted when it contains more than whitespace (previously it
        was dropped together with the element).

        ``is_first_tag`` marks the first ``<body>`` of an item: it is never
        treated as floating and its tail is ignored. ``float_spec`` is the
        float specification inherited from an enclosing floating element.
        """
        tagname = barename(html_tag.tag)
        is_block = False
        if tagname in self._content_free_tags:
            # Never styled, exactly as before; only the tail can matter.
            display, skip_contents = None, True
        else:
            tag_style = stylizer.style(html_tag)
            display = tag_style._get('display')
            skip_contents = tag_style.is_hidden
            if not skip_contents:
                is_block = self._process_tag_contents(
                    tagname, html_tag, tag_style, display, stylizer, is_first_tag, float_spec)

        if display == 'table-row':
            return  # We ignore the tail for these tags

        tail = html_tag.tail
        if is_first_tag or not tail:
            return
        # Ignore trailing space after a block tag, as otherwise it will
        # become a new empty paragraph. The same applies after a skipped tag,
        # so that only real text is recovered from it.
        ignore_whitespace_tail = is_block or skip_contents or display.startswith('table')
        if ignore_whitespace_tail and tail.isspace():
            return
        parent = html_tag.getparent()
        parent_style = stylizer.style(parent)
        block = self.blocks.current_or_new_block(parent, parent_style)
        block.add_text(tail, parent_style, is_parent_style=True, link=self.current_link)

    def _process_tag_contents(self, tagname, html_tag, tag_style, display, stylizer, is_first_tag, float_spec):
        """Emit ``html_tag`` itself and recurse into its element children.

        Returns True if the tag opened a block (it was listed in
        ``self.blocks.open_html_blocks`` once its children were processed).
        """
        previous_link = self.current_link
        if tagname == 'a' and html_tag.get('href'):
            self.current_link = (self.current_item, html_tag.get('href'), html_tag.get('title'))

        is_float = tag_style['float'] in {'left', 'right'} and not is_first_tag
        if float_spec is None and is_float:
            float_spec = FloatSpec(self.docx.namespace, html_tag, tag_style)

        if display in {'inline', 'inline-block'} or tagname == 'br':
            # <br> has display:block but we dont want to start a new paragraph
            if is_float and float_spec.is_dropcaps:
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)
                float_spec = None
            else:
                self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
        elif display == 'list-item':
            self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_list_item=True)
        elif display.startswith('table') or display == 'inline-table':
            if display == 'table-cell':
                self.blocks.start_new_cell(html_tag, tag_style)
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_table_cell=True)
            elif display == 'table-row':
                self.blocks.start_new_row(html_tag, tag_style)
            elif display in {'table', 'inline-table'}:
                self.blocks.end_current_block()
                self.blocks.start_new_table(html_tag, tag_style)
        else:
            if tagname == 'img' and is_float:
                # Image is floating so dont start a new paragraph for it
                self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            else:
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)

        for child in html_tag.iterchildren('*'):
            self.process_tag(child, stylizer, float_spec=float_spec)

        is_block = html_tag in self.blocks.open_html_blocks
        self.blocks.finish_tag(html_tag)
        if is_block and tag_style['page-break-after'] == 'avoid':
            self.blocks.all_blocks[-1].keep_next = True

        # A link only covers the <a> element and its descendants.
        self.current_link = previous_link
        return is_block

    def _block_for_parent(self, html_tag, stylizer):
        """Return the current (or a new) block for the parent of ``html_tag``."""
        parent = html_tag.getparent()
        return self.blocks.current_or_new_block(parent, stylizer.style(parent))

    def add_block_tag(self, tagname, html_tag, tag_style, stylizer, is_table_cell=False, float_spec=None, is_list_item=False):
        """Start a new block for ``html_tag`` and add the tag's own content.

        An ``id``/``name`` attribute becomes a bookmark on the block. An
        ``<img>`` is added as a block-level image; other tags contribute
        their leading text, if any.
        """
        block = self.blocks.start_new_block(
            html_tag, tag_style,
            is_table_cell=is_table_cell, float_spec=float_spec, is_list_item=is_list_item)
        anchor = html_tag.get('id') or html_tag.get('name')
        if anchor:
            block.bookmarks.add(self.bookmark_for_anchor(anchor, html_tag))
        if tagname == 'img':
            self.images_manager.add_image(html_tag, block, stylizer, as_block=True)
        elif html_tag.text:
            block.add_text(
                html_tag.text, tag_style, ignore_leading_whitespace=True, is_parent_style=True,
                link=self.current_link)

    def add_inline_tag(self, tagname, html_tag, tag_style, stylizer):
        """Add the content of an inline ``html_tag`` to its parent's block.

        ``<br>`` becomes a break (unless it has no tail text and is the last
        element child of its parent), ``<img>`` an inline image, anything
        else a text run. An ``id``/``name`` attribute is turned into a
        bookmark attached to what is added; a tag that has an anchor but no
        text gets an empty run so that links to the anchor keep a target.
        """
        anchor = html_tag.get('id') or html_tag.get('name') or None
        bmark = None
        if anchor:
            bmark = self.bookmark_for_anchor(anchor, html_tag)

        if tagname == 'br':
            if html_tag.tail or html_tag is not tuple(html_tag.getparent().iterchildren('*'))[-1]:
                block = self._block_for_parent(html_tag, stylizer)
                clear = {'both': 'all', 'left': 'left', 'right': 'right'}.get(tag_style['clear'], 'none')
                block.add_break(clear=clear, bookmark=bmark)
        elif tagname == 'img':
            block = self._block_for_parent(html_tag, stylizer)
            self.images_manager.add_image(html_tag, block, stylizer, bookmark=bmark)
        elif html_tag.text:
            block = self._block_for_parent(html_tag, stylizer)
            block.add_text(
                html_tag.text, tag_style, is_parent_style=False, bookmark=bmark,
                link=self.current_link)
        elif bmark:
            # No text to carry the bookmark: add an empty run for it.
            block = self._block_for_parent(html_tag, stylizer)
            block.add_text(
                '', tag_style, is_parent_style=False, bookmark=bmark,
                link=self.current_link)

    def bookmark_for_anchor(self, anchor, html_tag):
        """Ask the links manager for the bookmark of ``anchor`` in the current item."""
        return self.links_manager.bookmark_for_anchor(anchor, self.current_item, html_tag)

    def write(self):
        """Serialize the blocks and all managers into ``self.docx``."""
        self.docx.document, self.docx.styles, body = create_skeleton(self.opts)
        self.blocks.serialize(body)
        body.append(body[0])  # Move <sectPr> to the end
        self.styles_manager.serialize(self.docx.styles)
        self.images_manager.serialize(self.docx.images)
        self.fonts_manager.serialize(
            self.styles_manager.text_styles, self.docx.font_table,
            self.docx.embedded_fonts, self.docx.fonts)
        self.lists_manager.serialize(self.docx.numbering)
class Convert(object):
    """Turn the spine of an OEB book into a flat list of DOCX blocks.

    Calling an instance walks every spine item, collects ``Block`` objects in
    ``self.blocks`` and then builds the document and styles trees on
    ``self.docx``.
    """

    def __init__(self, oeb, docx):
        """Remember the source book ``oeb`` and the target container ``docx``."""
        self.oeb, self.docx = oeb, docx
        self.log, self.opts = docx.log, docx.opts
        self.blocks = []

    def __call__(self):
        """Run the conversion and write the result into ``self.docx``."""
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        self.svg_rasterizer = SVGRasterizer()
        self.svg_rasterizer(self.oeb, self.opts)

        self.styles_manager = StylesManager()
        self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships)
        self.fonts_manager = FontsManager(self.oeb)

        for item in self.oeb.spine:
            self.process_item(item)

        self.styles_manager.finalize(self.blocks)
        self.write()

    def process_item(self, item):
        """Convert every ``<body>`` of one spine ``item`` into blocks."""
        stylizer = self.svg_rasterizer.stylizer_cache.get(item)
        if stylizer is None:
            stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile)
        self.abshref = self.images_manager.abshref = item.abshref

        is_first_block = True
        for body in XPath('//h:body')(item.data):
            b = Block(self.styles_manager, body, stylizer.style(body), is_first_block=is_first_block)
            self.blocks.append(b)
            is_first_block = False
            self.process_block(body, b, stylizer, ignore_tail=True)
        # NOTE(review): only the very first block of the document is checked.
        if self.blocks and self.blocks[0].is_empty():
            del self.blocks[0]

    def process_block(self, html_block, docx_block, stylizer, ignore_tail=False):
        """Fill ``docx_block`` from ``html_block`` and handle the tail text.

        The contents of a hidden element are skipped. Its tail text belongs
        to the parent element and is therefore still emitted (previously it
        was dropped together with the element). Unless ``ignore_tail`` is
        exactly ``False``, the tail is never emitted.
        """
        block_style = stylizer.style(html_block)
        if not block_style.is_hidden:
            self._fill_block(html_block, docx_block, block_style, stylizer)
            if block_style['page-break-after'] == 'avoid':
                self.blocks[-1].keep_next = True

        tail = html_block.tail
        if ignore_tail is False and tail and tail.strip():
            # Text after a block element starts a new block styled like the
            # parent element.
            parent = html_block.getparent()
            parent_style = stylizer.style(parent)
            tail_block = Block(self.styles_manager, parent, parent_style)
            self.blocks.append(tail_block)
            tail_block.add_text(tail, parent_style, is_parent_style=True)

    def _fill_block(self, html_block, docx_block, block_style, stylizer):
        """Add the image or the text and children of ``html_block``."""
        if html_block.tag.endswith('}img'):
            self.images_manager.add_image(html_block, docx_block, stylizer)
            return

        if html_block.text:
            docx_block.add_text(
                html_block.text, block_style, ignore_leading_whitespace=True, is_parent_style=True)

        for child in html_block.iterchildren(etree.Element):
            tag = barename(child.tag)
            child_style = stylizer.style(child)
            display = child_style._get('display')
            starts_block = display == 'block' and tag != 'br'
            if starts_block and tag == 'img' and child_style['float'] in {'left', 'right'}:
                # Image is floating so dont start a new paragraph for it
                starts_block = False
            if starts_block:
                b = Block(self.styles_manager, child, child_style)
                self.blocks.append(b)
                self.process_block(child, b, stylizer)
            else:
                self.process_inline(child, self.blocks[-1], stylizer)

    def process_inline(self, html_child, docx_block, stylizer):
        """Add an inline element to ``docx_block`` and emit its tail text.

        The contents of a hidden element are skipped, but its tail text is
        still added to the last block because it belongs to the parent
        element (previously it was dropped together with the element).
        """
        tag = barename(html_child.tag)
        style = stylizer.style(html_child)

        if style.is_hidden:
            pass  # nothing of the element itself is rendered
        elif tag == 'br':
            # A <br> without tail text that is the last child of its parent
            # produces no break.
            if html_child.tail or html_child is not html_child.getparent()[-1]:
                clear = {'both':'all', 'left':'left', 'right':'right'}.get(style['clear'], 'none')
                docx_block.add_break(clear=clear)
        elif tag == 'img':
            self.images_manager.add_image(html_child, docx_block, stylizer)
        else:
            if html_child.text:
                docx_block.add_text(html_child.text, style, html_parent=html_child)
            for child in html_child.iterchildren(etree.Element):
                child_style = stylizer.style(child)
                # NOTE(review): process_block reads the display value with
                # ``_get('display')`` instead -- confirm the difference is
                # intended.
                display = child_style.get('display', 'inline')
                if display == 'block':
                    b = Block(self.styles_manager, child, child_style)
                    self.blocks.append(b)
                    self.process_block(child, b, stylizer)
                else:
                    self.process_inline(child, self.blocks[-1], stylizer)

        if html_child.tail:
            parent = html_child.getparent()
            self.blocks[-1].add_text(
                html_child.tail, stylizer.style(parent), html_parent=parent, is_parent_style=True)

    def _element_maker(self, prefixes):
        """Return an ``ElementMaker`` in the ``w`` namespace that declares ``prefixes``."""
        # ``items()`` instead of ``iteritems()`` so this runs on Python 2 and 3.
        nsmap = {k: v for k, v in namespaces.items() if k in prefixes}
        return ElementMaker(namespace=nsmap['w'], nsmap=nsmap)

    def write(self):
        """Build the document and styles trees and serialize the managers."""
        E = self._element_maker({'w', 'r', 'm', 've', 'o', 'wp', 'w10', 'wne', 'a', 'pic'})
        self.docx.document = doc = E.document()
        body = E.body()
        doc.append(body)
        for block in self.blocks:
            block.serialize(body)

        # Page size and margins are multiplied by 20 before being written
        # (presumably points to twentieths of a point -- TODO confirm units).
        width, height = PAPER_SIZES[self.opts.docx_page_size]
        if self.opts.docx_custom_page_size is not None:
            # The custom size is read as "<width>x<height>".
            width, height = map(float, self.opts.docx_custom_page_size.partition('x')[0::2])
        width, height = int(20 * width), int(20 * height)

        def margin(which):
            return w(which), str(int(getattr(self.opts, 'margin_'+which) * 20))

        body.append(E.sectPr(
            E.pgSz(**{w('w'):str(width), w('h'):str(height)}),
            E.pgMar(**dict(map(margin, 'left top right bottom'.split()))),
            E.cols(**{w('space'):'720'}),
            E.docGrid(**{w('linePitch'):"360"}),
        ))

        E = self._element_maker({'w', 'r', 'a', 'wp'})
        self.docx.styles = E.styles(
            E.docDefaults(
                E.rPrDefault(
                    E.rPr(
                        E.rFonts(**{
                            w('asciiTheme'):"minorHAnsi", w('eastAsiaTheme'):"minorEastAsia",
                            w('hAnsiTheme'):"minorHAnsi", w('cstheme'):"minorBidi"}),
                        E.sz(**{w('val'):'22'}),
                        E.szCs(**{w('val'):'22'}),
                        E.lang(**{w('val'):'en-US', w('eastAsia'):"en-US", w('bidi'):"ar-SA"})
                    )
                ),
                E.pPrDefault(
                    E.pPr(
                        E.spacing(**{w('after'):"0", w('line'):"276", w('lineRule'):"auto"})
                    )
                )
            )
        )
        self.docx.images = {}
        self.styles_manager.serialize(self.docx.styles)
        self.images_manager.serialize(self.docx.images)
        self.fonts_manager.serialize(
            self.styles_manager.text_styles, self.docx.font_table, self.docx.embedded_fonts)