Example #1
0
    def __call__(self):
        """Convert the OEB book and write the result into the DOCX container.

        SVG content is rasterized first, then the helper managers are created,
        every spine item is converted into blocks, blocks that resolve as
        skipped are deleted, and finally lists and styles are finalized before
        ``self.write()`` is called.
        """
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        self.svg_rasterizer = SVGRasterizer()
        self.svg_rasterizer(self.oeb, self.opts)

        docx = self.docx
        self.styles_manager = StylesManager(docx.namespace)
        self.images_manager = ImagesManager(self.oeb, docx.document_relationships)
        self.lists_manager = ListsManager(docx)
        self.fonts_manager = FontsManager(docx.namespace, self.oeb, self.opts)
        self.blocks = Blocks(docx.namespace, self.styles_manager)

        for spine_item in self.oeb.spine:
            self.process_item(spine_item)

        all_blocks = self.blocks.all_blocks
        skipped_positions = []
        for index, current in enumerate(all_blocks):
            if index + 1 >= len(all_blocks):
                # The last block has no successor to be resolved against.
                break
            current.resolve_skipped(all_blocks[index + 1])
            if current.skipped:
                skipped_positions.append(index)
        # Delete back to front, presumably so the remaining positions stay
        # valid while blocks are being removed.
        for index in reversed(skipped_positions):
            self.blocks.delete_block_at(index)

        self.lists_manager.finalize(all_blocks)
        self.styles_manager.finalize(all_blocks)
        self.write()
Example #2
0
    def __call__(self):
        """Convert the OEB book and write the result into the DOCX container.

        Steps, in order: rasterize SVG content, create the helper managers,
        convert every spine item into blocks, process the ToC links when
        ``self.add_toc`` is set, read the cover image when ``self.add_cover``
        is set and the cover id is present in the manifest, delete the blocks
        that resolve as skipped, call ``apply_page_break_after()`` and
        ``resolve_language()`` on the blocks, build the cover markup, finalize
        lists and styles, and call ``self.write()``.

        A book that yields no blocks at all (for example an empty spine) is
        tolerated: the first-block flag is simply not set instead of raising
        ``IndexError``.
        """
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        self.svg_rasterizer = SVGRasterizer(base_css=self.base_css)
        self.svg_rasterizer(self.oeb, self.opts)

        self.styles_manager = StylesManager(self.docx.namespace, self.log,
                                            self.mi.language)
        self.links_manager = LinksManager(self.docx.namespace,
                                          self.docx.document_relationships,
                                          self.log)
        self.images_manager = ImagesManager(self.oeb,
                                            self.docx.document_relationships,
                                            self.opts)
        self.lists_manager = ListsManager(self.docx)
        self.fonts_manager = FontsManager(self.docx.namespace, self.oeb,
                                          self.opts)
        self.blocks = Blocks(self.docx.namespace, self.styles_manager,
                             self.links_manager)
        self.current_link = self.current_lang = None

        for item in self.oeb.spine:
            self.log.debug('Processing', item.href)
            self.process_item(item)
        if self.add_toc:
            self.links_manager.process_toc_links(self.oeb)

        if self.add_cover and self.oeb.metadata.cover:
            # Stringify the cover id once and reuse it for test and lookup.
            cover_id = str(self.oeb.metadata.cover[0])
            if cover_id in self.oeb.manifest.ids:
                cover_item = self.oeb.manifest.ids[cover_id]
                self.cover_img = self.images_manager.read_image(
                    cover_item.href)

        all_blocks = self.blocks.all_blocks
        remove_blocks = []
        for i, block in enumerate(all_blocks):
            try:
                nb = all_blocks[i + 1]
            except IndexError:
                # The last block has no successor to be resolved against.
                break
            block.resolve_skipped(nb)
            if block.skipped:
                remove_blocks.append(i)
        # Delete back to front, presumably so the remaining positions stay
        # valid while blocks are being removed.
        for pos in reversed(remove_blocks):
            self.blocks.delete_block_at(pos)
        # Guard against a document without any block (e.g. an empty spine).
        if self.blocks.all_blocks:
            self.blocks.all_blocks[0].is_first_block = True
        self.blocks.apply_page_break_after()
        self.blocks.resolve_language()

        if self.cover_img is not None:
            self.cover_img = self.images_manager.create_cover_markup(
                self.cover_img, self.opts.preserve_cover_aspect_ratio,
                *page_size(self.opts))
        self.lists_manager.finalize(all_blocks)
        self.styles_manager.finalize(all_blocks)
        self.write()
Example #3
0
    def __call__(self):
        """Convert the OEB book and write the result into the DOCX container.

        Steps, in order: rasterize SVG content, create the helper managers,
        convert every spine item into blocks, delete the blocks that resolve
        as skipped, flag the first remaining block, finalize lists and styles,
        and call ``self.write()``.

        A book that yields no blocks at all (for example an empty spine) is
        tolerated: the first-block flag is simply not set instead of raising
        ``IndexError``.
        """
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        self.svg_rasterizer = SVGRasterizer(base_css=self.base_css)
        self.svg_rasterizer(self.oeb, self.opts)

        self.styles_manager = StylesManager(self.docx.namespace)
        self.links_manager = LinksManager(self.docx.namespace, self.docx.document_relationships)
        self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships)
        self.lists_manager = ListsManager(self.docx)
        self.fonts_manager = FontsManager(self.docx.namespace, self.oeb, self.opts)
        self.blocks = Blocks(self.docx.namespace, self.styles_manager, self.links_manager)
        self.current_link = None

        for item in self.oeb.spine:
            self.process_item(item)

        all_blocks = self.blocks.all_blocks
        remove_blocks = []
        for i, block in enumerate(all_blocks):
            try:
                nb = all_blocks[i+1]
            except IndexError:
                # The last block has no successor to be resolved against.
                break
            block.resolve_skipped(nb)
            if block.skipped:
                remove_blocks.append(i)
        # Delete back to front, presumably so the remaining positions stay
        # valid while blocks are being removed.
        for pos in reversed(remove_blocks):
            self.blocks.delete_block_at(pos)
        # Guard against a document without any block (e.g. an empty spine).
        if self.blocks.all_blocks:
            self.blocks.all_blocks[0].is_first_block = True

        self.lists_manager.finalize(all_blocks)
        self.styles_manager.finalize(all_blocks)
        self.write()
Example #4
0
    def __call__(self):
        """Convert the OEB book and write the result into the DOCX container.

        Steps, in order: rasterize SVG content, create the helper managers,
        convert every spine item into blocks, process the ToC links when
        ``self.add_toc`` is set, read the cover image when ``self.add_cover``
        is set and the cover id is present in the manifest, delete the blocks
        that resolve as skipped, call ``apply_page_break_after()`` and
        ``resolve_language()`` on the blocks, build the cover markup, finalize
        lists and styles, and call ``self.write()``.

        A book that yields no blocks at all (for example an empty spine) is
        tolerated: the first-block flag is simply not set instead of raising
        ``IndexError``.
        """
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        self.svg_rasterizer = SVGRasterizer(base_css=self.base_css)
        self.svg_rasterizer(self.oeb, self.opts)

        self.styles_manager = StylesManager(self.docx.namespace, self.log, self.mi.language)
        self.links_manager = LinksManager(self.docx.namespace, self.docx.document_relationships, self.log)
        self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships, self.opts)
        self.lists_manager = ListsManager(self.docx)
        self.fonts_manager = FontsManager(self.docx.namespace, self.oeb, self.opts)
        self.blocks = Blocks(self.docx.namespace, self.styles_manager, self.links_manager)
        self.current_link = self.current_lang = None

        for item in self.oeb.spine:
            self.log.debug('Processing', item.href)
            self.process_item(item)
        if self.add_toc:
            self.links_manager.process_toc_links(self.oeb)

        if self.add_cover and self.oeb.metadata.cover:
            # Convert the cover id once and reuse it for test and lookup.
            cover_id = unicode(self.oeb.metadata.cover[0])
            if cover_id in self.oeb.manifest.ids:
                cover_item = self.oeb.manifest.ids[cover_id]
                self.cover_img = self.images_manager.read_image(cover_item.href)

        all_blocks = self.blocks.all_blocks
        remove_blocks = []
        for i, block in enumerate(all_blocks):
            try:
                nb = all_blocks[i+1]
            except IndexError:
                # The last block has no successor to be resolved against.
                break
            block.resolve_skipped(nb)
            if block.skipped:
                remove_blocks.append(i)
        # Delete back to front, presumably so the remaining positions stay
        # valid while blocks are being removed.
        for pos in reversed(remove_blocks):
            self.blocks.delete_block_at(pos)
        # Guard against a document without any block (e.g. an empty spine).
        if self.blocks.all_blocks:
            self.blocks.all_blocks[0].is_first_block = True
        self.blocks.apply_page_break_after()
        self.blocks.resolve_language()

        if self.cover_img is not None:
            self.cover_img = self.images_manager.create_cover_markup(self.cover_img, self.opts.preserve_cover_aspect_ratio, *page_size(self.opts))
        self.lists_manager.finalize(all_blocks)
        self.styles_manager.finalize(all_blocks)
        self.write()
Example #5
0
class Convert:
    """Convert an OEB book into the parts of a DOCX container.

    An instance is callable: calling it converts every spine item of ``oeb``
    into blocks and then serializes the document body, styles, images, fonts
    and numbering into ``docx``.
    """

    # Word does not apply default styling to hyperlinks, so we ensure they get
    # default styling (the conversion pipeline does not apply any styling to
    # them).
    base_css = '''
    a[href] { text-decoration: underline; color: blue }
    '''

    def __init__(self, oeb, docx, mi, add_cover, add_toc):
        """Store the conversion inputs and size the output profile.

        :param oeb: The OEB book to convert.
        :param docx: The DOCX container to fill; also supplies ``log`` and
            ``opts``.
        :param mi: Metadata object; its ``language`` is handed to the styles
            manager.
        :param add_cover: When true, the cover image is added if its id is
            present in the manifest.
        :param add_toc: When true, ToC links are processed after the spine.
        """
        self.oeb, self.docx, self.add_cover, self.add_toc = oeb, docx, add_cover, add_toc
        self.log, self.opts = docx.log, docx.opts
        self.mi = mi
        self.cover_img = None
        p = self.opts.output_profile
        p.width_pts, p.height_pts = page_effective_area(self.opts)

    def __call__(self):
        """Run the whole conversion and write the result into ``self.docx``.

        Steps, in order: rasterize SVG content, create the helper managers,
        convert every spine item into blocks, process the ToC links when
        ``self.add_toc`` is set, read the cover image when requested, delete
        the blocks that resolve as skipped, call ``apply_page_break_after()``
        and ``resolve_language()`` on the blocks, build the cover markup,
        finalize lists and styles, and call ``self.write()``.

        A book that yields no blocks at all (for example an empty spine) is
        tolerated: the first-block flag is simply not set instead of raising
        ``IndexError``.
        """
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        self.svg_rasterizer = SVGRasterizer(base_css=self.base_css)
        self.svg_rasterizer(self.oeb, self.opts)

        self.styles_manager = StylesManager(self.docx.namespace, self.log,
                                            self.mi.language)
        self.links_manager = LinksManager(self.docx.namespace,
                                          self.docx.document_relationships,
                                          self.log)
        self.images_manager = ImagesManager(self.oeb,
                                            self.docx.document_relationships,
                                            self.opts)
        self.lists_manager = ListsManager(self.docx)
        self.fonts_manager = FontsManager(self.docx.namespace, self.oeb,
                                          self.opts)
        self.blocks = Blocks(self.docx.namespace, self.styles_manager,
                             self.links_manager)
        self.current_link = self.current_lang = None

        for item in self.oeb.spine:
            self.log.debug('Processing', item.href)
            self.process_item(item)
        if self.add_toc:
            self.links_manager.process_toc_links(self.oeb)

        if self.add_cover and self.oeb.metadata.cover and str(
                self.oeb.metadata.cover[0]) in self.oeb.manifest.ids:
            cover_id = str(self.oeb.metadata.cover[0])
            item = self.oeb.manifest.ids[cover_id]
            self.cover_img = self.images_manager.read_image(item.href)

        all_blocks = self.blocks.all_blocks
        remove_blocks = []
        for i, block in enumerate(all_blocks):
            try:
                nb = all_blocks[i + 1]
            except IndexError:
                # The last block has no successor to be resolved against.
                break
            block.resolve_skipped(nb)
            if block.skipped:
                remove_blocks.append(i)
        # Delete back to front, presumably so the remaining positions stay
        # valid while blocks are being removed.
        for pos in reversed(remove_blocks):
            self.blocks.delete_block_at(pos)
        # Guard against a document without any block (e.g. an empty spine).
        if self.blocks.all_blocks:
            self.blocks.all_blocks[0].is_first_block = True
        self.blocks.apply_page_break_after()
        self.blocks.resolve_language()

        if self.cover_img is not None:
            self.cover_img = self.images_manager.create_cover_markup(
                self.cover_img, self.opts.preserve_cover_aspect_ratio,
                *page_size(self.opts))
        self.lists_manager.finalize(all_blocks)
        self.styles_manager.finalize(all_blocks)
        self.write()

    def process_item(self, item):
        """Convert one spine item into blocks.

        The stylizer cached by the SVG rasterizer is reused when available,
        otherwise a new one is built. Every ``<body>`` found in the item is
        processed inside the ``self.blocks`` context manager.

        :param item: The spine item; its ``data``, ``href`` and ``abshref``
            are read.
        """
        self.current_item = item
        stylizer = self.svg_rasterizer.stylizer_cache.get(item)
        if stylizer is None:
            stylizer = Stylizer(item.data,
                                item.href,
                                self.oeb,
                                self.opts,
                                profile=self.opts.output_profile,
                                base_css=self.base_css)
        self.abshref = self.images_manager.abshref = item.abshref

        # Fall back to the document language when the item declares none.
        self.current_lang = lang_for_tag(
            item.data) or self.styles_manager.document_lang
        for i, body in enumerate(XPath('//h:body')(item.data)):
            with self.blocks:
                self.blocks.top_bookmark = self.links_manager.bookmark_for_anchor(
                    self.links_manager.top_anchor, self.current_item, body)
                self.process_tag(body, stylizer, is_first_tag=i == 0)

    def process_tag(self,
                    html_tag,
                    stylizer,
                    is_first_tag=False,
                    float_spec=None):
        """Convert ``html_tag``, its descendants and its tail text.

        The tag is dispatched on its computed ``display`` value to an inline,
        block, list-item or table handler; children are then processed
        recursively. The current link and language are saved before and
        restored after the tag's contents are handled.

        :param html_tag: The element to convert.
        :param stylizer: Object whose ``style(tag)`` returns the computed
            style of a tag.
        :param is_first_tag: True for the first ``<body>`` of an item; such a
            tag is never treated as floating and its tail text is ignored.
        :param float_spec: Float specification inherited from an enclosing
            floating tag, or None.
        """
        tagname = barename(html_tag.tag)
        tag_style = stylizer.style(html_tag)
        ignore_tag_contents = tagname in {'script', 'style', 'title', 'meta'
                                          } or tag_style.is_hidden
        display = tag_style._get('display')
        is_block = False

        if not ignore_tag_contents:
            previous_link = self.current_link
            if tagname == 'a' and html_tag.get('href'):
                self.current_link = (self.current_item, html_tag.get('href'),
                                     html_tag.get('title'))
            previous_lang = self.current_lang
            tag_lang = lang_for_tag(html_tag)
            if tag_lang:
                self.current_lang = tag_lang

            is_float = tag_style['float'] in {'left', 'right'
                                              } and not is_first_tag
            if float_spec is None and is_float:
                float_spec = FloatSpec(self.docx.namespace, html_tag,
                                       tag_style)

            if display in {
                    'inline', 'inline-block'
            } or tagname == 'br':  # <br> has display:block but we dont want to start a new paragraph
                if is_float and float_spec.is_dropcaps:
                    self.add_block_tag(tagname,
                                       html_tag,
                                       tag_style,
                                       stylizer,
                                       float_spec=float_spec)
                    # The float spec is not passed on to the children.
                    float_spec = None
                else:
                    self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            elif display == 'list-item':
                self.add_block_tag(tagname,
                                   html_tag,
                                   tag_style,
                                   stylizer,
                                   is_list_item=True)
            elif display.startswith('table') or display == 'inline-table':
                if display == 'table-cell':
                    self.blocks.start_new_cell(html_tag, tag_style)
                    self.add_block_tag(tagname,
                                       html_tag,
                                       tag_style,
                                       stylizer,
                                       is_table_cell=True)
                elif display == 'table-row':
                    self.blocks.start_new_row(html_tag, tag_style)
                elif display in {'table', 'inline-table'}:
                    self.blocks.end_current_block()
                    self.blocks.start_new_table(html_tag, tag_style)
            else:
                if tagname == 'img' and is_float:
                    # Image is floating so dont start a new paragraph for it
                    self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
                else:
                    if tagname == 'hr':
                        # Leave only the top border style of an <hr> untouched.
                        for edge in 'right bottom left'.split():
                            tag_style.set('border-%s-style' % edge, 'none')
                    self.add_block_tag(tagname,
                                       html_tag,
                                       tag_style,
                                       stylizer,
                                       float_spec=float_spec)

            for child in html_tag.iterchildren():
                if isinstance(getattr(child, 'tag', None), string_or_bytes):
                    self.process_tag(child, stylizer, float_spec=float_spec)
                else:  # Comment/PI/etc.
                    # The node itself is dropped but the text after it is kept.
                    tail = getattr(child, 'tail', None)
                    if tail:
                        block = self.create_block_from_parent(
                            html_tag, stylizer)
                        block.add_text(tail,
                                       tag_style,
                                       is_parent_style=False,
                                       link=self.current_link,
                                       lang=self.current_lang)

            # Must be read before finish_tag() is called on this tag.
            is_block = html_tag in self.blocks.open_html_blocks
            self.blocks.finish_tag(html_tag)
            if is_block and tag_style['page-break-after'] == 'avoid':
                self.blocks.all_blocks[-1].keep_next = True

            self.current_link = previous_link
            self.current_lang = previous_lang

        # Now, process the tail if any

        if display == 'table-row':
            return  # We ignore the tail for these tags

        ignore_whitespace_tail = is_block or display.startswith('table')
        if not is_first_tag and html_tag.tail and (
                not ignore_whitespace_tail or not html_tag.tail.isspace()):
            # Ignore trailing space after a block tag, as otherwise it will
            # become a new empty paragraph
            block = self.create_block_from_parent(html_tag, stylizer)
            block.add_text(html_tag.tail,
                           stylizer.style(html_tag.getparent()),
                           is_parent_style=True,
                           link=self.current_link,
                           lang=self.current_lang)

    def create_block_from_parent(self, html_tag, stylizer):
        """Return the current (or a new) block for the parent of ``html_tag``.

        The returned block always has ``page_break_before`` reset to False.
        """
        parent = html_tag.getparent()
        block = self.blocks.current_or_new_block(parent,
                                                 stylizer.style(parent))
        # Do not inherit page-break-before from parent
        block.page_break_before = False
        return block

    def add_block_tag(self,
                      tagname,
                      html_tag,
                      tag_style,
                      stylizer,
                      is_table_cell=False,
                      float_spec=None,
                      is_list_item=False):
        """Start a new block for ``html_tag`` and add its leading content.

        A bookmark is added when the tag has an ``id`` or ``name``. An
        ``<img>`` is added as a block-level image; for any other tag the
        tag's own text, if any, is added to the block.

        :param tagname: Bare name of the tag.
        :param html_tag: The element that starts the block.
        :param tag_style: Computed style of ``html_tag``.
        :param stylizer: Stylizer handed to the images manager.
        :param is_table_cell: Forwarded to ``start_new_block``.
        :param float_spec: Forwarded to ``start_new_block``.
        :param is_list_item: Forwarded to ``start_new_block``.
        """
        block = self.blocks.start_new_block(html_tag,
                                            tag_style,
                                            is_table_cell=is_table_cell,
                                            float_spec=float_spec,
                                            is_list_item=is_list_item)
        anchor = html_tag.get('id') or html_tag.get('name')
        if anchor:
            block.bookmarks.add(self.bookmark_for_anchor(anchor, html_tag))
        if tagname == 'img':
            self.images_manager.add_image(html_tag,
                                          block,
                                          stylizer,
                                          as_block=True)
        else:
            text = html_tag.text
            is_list_item = tagname == 'li'
            # An <li> whose first child is a non-empty <ul>/<ol>.
            has_sublist = is_list_item and len(html_tag) and barename(
                html_tag[0].tag) in ('ul', 'ol') and len(html_tag[0])
            if text and has_sublist and not text.strip():
                text = ''  # whitespace only, ignore
            if text:
                block.add_text(text,
                               tag_style,
                               ignore_leading_whitespace=True,
                               is_parent_style=True,
                               link=self.current_link,
                               lang=self.current_lang)
            elif has_sublist:
                block.force_not_empty = True

    def add_inline_tag(self, tagname, html_tag, tag_style, stylizer):
        """Add an inline tag to the block of its parent.

        ``<br>`` becomes a break (unless it is the last element child and has
        no tail), ``<img>`` becomes an inline image, and any other tag
        contributes its text. A tag that has no text but carries an anchor
        still adds an empty text run so the bookmark is kept.
        """
        anchor = html_tag.get('id') or html_tag.get('name') or None
        bmark = None
        if anchor:
            bmark = self.bookmark_for_anchor(anchor, html_tag)
        if tagname == 'br':
            if html_tag.tail or html_tag is not tuple(
                    html_tag.getparent().iterchildren('*'))[-1]:
                block = self.create_block_from_parent(html_tag, stylizer)
                block.add_break(clear={
                    'both': 'all',
                    'left': 'left',
                    'right': 'right'
                }.get(tag_style['clear'], 'none'),
                                bookmark=bmark)
        elif tagname == 'img':
            block = self.create_block_from_parent(html_tag, stylizer)
            self.images_manager.add_image(html_tag,
                                          block,
                                          stylizer,
                                          bookmark=bmark)
        else:
            if html_tag.text:
                block = self.create_block_from_parent(html_tag, stylizer)
                block.add_text(html_tag.text,
                               tag_style,
                               is_parent_style=False,
                               bookmark=bmark,
                               link=self.current_link,
                               lang=self.current_lang)
            elif bmark:
                block = self.create_block_from_parent(html_tag, stylizer)
                block.add_text('',
                               tag_style,
                               is_parent_style=False,
                               bookmark=bmark,
                               link=self.current_link,
                               lang=self.current_lang)

    def bookmark_for_anchor(self, anchor, html_tag):
        """Return the links manager's bookmark for ``anchor`` in the current item."""
        return self.links_manager.bookmark_for_anchor(anchor,
                                                      self.current_item,
                                                      html_tag)

    def write(self):
        """Serialize the converted content into the parts of ``self.docx``.

        The body receives the blocks, then the ToC (if the links manager has
        one) and the cover block (if a cover image was read); styles, images,
        fonts and numbering are serialized afterwards.
        """
        self.docx.document, self.docx.styles, body = create_skeleton(self.opts)
        self.blocks.serialize(body)
        body.append(body[0])  # Move <sectPr> to the end
        if self.links_manager.toc:
            self.links_manager.serialize_toc(
                body, self.styles_manager.primary_heading_style)
        if self.cover_img is not None:
            self.images_manager.write_cover_block(body, self.cover_img)
        self.styles_manager.serialize(self.docx.styles)
        self.images_manager.serialize(self.docx.images)
        self.fonts_manager.serialize(self.styles_manager.text_styles,
                                     self.docx.font_table,
                                     self.docx.embedded_fonts, self.docx.fonts)
        self.lists_manager.serialize(self.docx.numbering)
Example #6
0
class Convert(object):
    """Convert an OEB book into the parts of a DOCX container.

    An instance is callable: calling it converts every spine item of ``oeb``
    into blocks and then serializes the document body, styles, images, fonts
    and numbering into ``docx``.
    """

    # Word does not apply default styling to hyperlinks, so we ensure they get
    # default styling (the conversion pipeline does not apply any styling to
    # them).
    base_css = '''
    a[href] { text-decoration: underline; color: blue }
    '''

    def __init__(self, oeb, docx):
        """Store the book to convert and the container to fill.

        :param oeb: The OEB book to convert.
        :param docx: The DOCX container; also supplies ``log`` and ``opts``.
        """
        self.oeb, self.docx = oeb, docx
        self.log, self.opts = docx.log, docx.opts

    def __call__(self):
        """Run the whole conversion and write the result into ``self.docx``.

        Steps, in order: rasterize SVG content, create the helper managers,
        convert every spine item into blocks, delete the blocks that resolve
        as skipped, flag the first remaining block, finalize lists and styles,
        and call ``self.write()``.

        A book that yields no blocks at all (for example an empty spine) is
        tolerated: the first-block flag is simply not set instead of raising
        ``IndexError``.
        """
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        self.svg_rasterizer = SVGRasterizer(base_css=self.base_css)
        self.svg_rasterizer(self.oeb, self.opts)

        self.styles_manager = StylesManager(self.docx.namespace)
        self.links_manager = LinksManager(self.docx.namespace, self.docx.document_relationships)
        self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships)
        self.lists_manager = ListsManager(self.docx)
        self.fonts_manager = FontsManager(self.docx.namespace, self.oeb, self.opts)
        self.blocks = Blocks(self.docx.namespace, self.styles_manager, self.links_manager)
        self.current_link = None

        for item in self.oeb.spine:
            self.process_item(item)

        all_blocks = self.blocks.all_blocks
        remove_blocks = []
        for i, block in enumerate(all_blocks):
            try:
                nb = all_blocks[i+1]
            except IndexError:
                # The last block has no successor to be resolved against.
                break
            block.resolve_skipped(nb)
            if block.skipped:
                remove_blocks.append(i)
        # Delete back to front, presumably so the remaining positions stay
        # valid while blocks are being removed.
        for pos in reversed(remove_blocks):
            self.blocks.delete_block_at(pos)
        # Guard against a document without any block (e.g. an empty spine).
        if self.blocks.all_blocks:
            self.blocks.all_blocks[0].is_first_block = True

        self.lists_manager.finalize(all_blocks)
        self.styles_manager.finalize(all_blocks)
        self.write()

    def process_item(self, item):
        """Convert one spine item into blocks.

        The stylizer cached by the SVG rasterizer is reused when available,
        otherwise a new one is built. Every ``<body>`` found in the item gets
        an ``id`` (the links manager's top anchor when it has none) and is
        processed inside the ``self.blocks`` context manager.

        :param item: The spine item; its ``data``, ``href`` and ``abshref``
            are read.
        """
        self.current_item = item
        stylizer = self.svg_rasterizer.stylizer_cache.get(item)
        if stylizer is None:
            stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile, base_css=self.base_css)
        self.abshref = self.images_manager.abshref = item.abshref

        for i, body in enumerate(XPath('//h:body')(item.data)):
            with self.blocks:
                body.set('id', body.get('id', None) or self.links_manager.top_anchor)
                self.process_tag(body, stylizer, is_first_tag=i == 0)

    def process_tag(self, html_tag, stylizer, is_first_tag=False, float_spec=None):
        """Convert ``html_tag``, its element children and its tail text.

        ``script``, ``style``, ``title`` and ``meta`` tags as well as hidden
        tags are skipped entirely (including their tail). Other tags are
        dispatched on their computed ``display`` value to an inline, block,
        list-item or table handler; element children are then processed
        recursively.

        :param html_tag: The element to convert.
        :param stylizer: Object whose ``style(tag)`` returns the computed
            style of a tag.
        :param is_first_tag: True for the first ``<body>`` of an item; such a
            tag is never treated as floating and its tail text is ignored.
        :param float_spec: Float specification inherited from an enclosing
            floating tag, or None.
        """
        tagname = barename(html_tag.tag)
        if tagname in {'script', 'style', 'title', 'meta'}:
            return
        tag_style = stylizer.style(html_tag)
        if tag_style.is_hidden:
            return

        previous_link = self.current_link
        if tagname == 'a' and html_tag.get('href'):
            self.current_link = (self.current_item, html_tag.get('href'), html_tag.get('title'))

        display = tag_style._get('display')
        is_float = tag_style['float'] in {'left', 'right'} and not is_first_tag
        if float_spec is None and is_float:
            float_spec = FloatSpec(self.docx.namespace, html_tag, tag_style)

        if display in {'inline', 'inline-block'} or tagname == 'br':  # <br> has display:block but we dont want to start a new paragraph
            if is_float and float_spec.is_dropcaps:
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)
                # The float spec is not passed on to the children.
                float_spec = None
            else:
                self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
        elif display == 'list-item':
            self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_list_item=True)
        elif display.startswith('table') or display == 'inline-table':
            if display == 'table-cell':
                self.blocks.start_new_cell(html_tag, tag_style)
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_table_cell=True)
            elif display == 'table-row':
                self.blocks.start_new_row(html_tag, tag_style)
            elif display in {'table', 'inline-table'}:
                self.blocks.end_current_block()
                self.blocks.start_new_table(html_tag, tag_style)
        else:
            if tagname == 'img' and is_float:
                # Image is floating so dont start a new paragraph for it
                self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            else:
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)

        for child in html_tag.iterchildren('*'):
            self.process_tag(child, stylizer, float_spec=float_spec)

        # Must be read before finish_tag() is called on this tag.
        is_block = html_tag in self.blocks.open_html_blocks
        self.blocks.finish_tag(html_tag)
        if is_block and tag_style['page-break-after'] == 'avoid':
            self.blocks.all_blocks[-1].keep_next = True

        self.current_link = previous_link

        if display == 'table-row':
            return  # We ignore the tail for these tags

        ignore_whitespace_tail = is_block or display.startswith('table')
        if not is_first_tag and html_tag.tail and (not ignore_whitespace_tail or not html_tag.tail.isspace()):
            # Ignore trailing space after a block tag, as otherwise it will
            # become a new empty paragraph
            block = self.blocks.current_or_new_block(html_tag.getparent(), stylizer.style(html_tag.getparent()))
            block.add_text(html_tag.tail, stylizer.style(html_tag.getparent()), is_parent_style=True, link=self.current_link)

    def add_block_tag(self, tagname, html_tag, tag_style, stylizer, is_table_cell=False, float_spec=None, is_list_item=False):
        """Start a new block for ``html_tag`` and add its leading content.

        A bookmark is added when the tag has an ``id`` or ``name``. An
        ``<img>`` is added as a block-level image; for any other tag the
        tag's own text, if any, is added to the block.

        ``is_table_cell``, ``float_spec`` and ``is_list_item`` are forwarded
        to ``start_new_block``.
        """
        block = self.blocks.start_new_block(html_tag, tag_style, is_table_cell=is_table_cell, float_spec=float_spec, is_list_item=is_list_item)
        anchor = html_tag.get('id') or html_tag.get('name')
        if anchor:
            block.bookmarks.add(self.bookmark_for_anchor(anchor, html_tag))
        if tagname == 'img':
            self.images_manager.add_image(html_tag, block, stylizer, as_block=True)
        else:
            if html_tag.text:
                block.add_text(html_tag.text, tag_style, ignore_leading_whitespace=True, is_parent_style=True, link=self.current_link)

    def add_inline_tag(self, tagname, html_tag, tag_style, stylizer):
        """Add an inline tag to the block of its parent.

        ``<br>`` becomes a break (unless it is the last element child and has
        no tail), ``<img>`` becomes an inline image, and any other tag
        contributes its text, if it has any.
        """
        anchor = html_tag.get('id') or html_tag.get('name') or None
        bmark = None
        if anchor:
            bmark = self.bookmark_for_anchor(anchor, html_tag)
        if tagname == 'br':
            if html_tag.tail or html_tag is not tuple(html_tag.getparent().iterchildren('*'))[-1]:
                block = self.blocks.current_or_new_block(html_tag.getparent(), stylizer.style(html_tag.getparent()))
                block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(tag_style['clear'], 'none'), bookmark=bmark)
        elif tagname == 'img':
            block = self.blocks.current_or_new_block(html_tag.getparent(), stylizer.style(html_tag.getparent()))
            self.images_manager.add_image(html_tag, block, stylizer, bookmark=bmark)
        else:
            if html_tag.text:
                block = self.blocks.current_or_new_block(html_tag.getparent(), stylizer.style(html_tag.getparent()))
                block.add_text(html_tag.text, tag_style, is_parent_style=False, bookmark=bmark, link=self.current_link)

    def bookmark_for_anchor(self, anchor, html_tag):
        """Return the links manager's bookmark for ``anchor`` in the current item."""
        return self.links_manager.bookmark_for_anchor(anchor, self.current_item, html_tag)

    def write(self):
        """Serialize the converted content into the parts of ``self.docx``.

        The body receives the blocks; styles, images, fonts and numbering are
        serialized afterwards.
        """
        self.docx.document, self.docx.styles, body = create_skeleton(self.opts)
        self.blocks.serialize(body)
        body.append(body[0])  # Move <sectPr> to the end
        self.styles_manager.serialize(self.docx.styles)
        self.images_manager.serialize(self.docx.images)
        self.fonts_manager.serialize(self.styles_manager.text_styles, self.docx.font_table, self.docx.embedded_fonts, self.docx.fonts)
        self.lists_manager.serialize(self.docx.numbering)
Example #7
0
class Convert(object):
    '''
    Convert the HTML content of an OEB book into the parts of a DOCX
    container.

    The work is done by calling the instance (see :meth:`__call__`): every
    spine item is walked tag by tag, the tags are turned into blocks held by
    ``self.blocks`` and finally everything is serialized by :meth:`write`.
    '''

    # Word does not apply default styling to hyperlinks, so we ensure they get
    # default styling (the conversion pipeline does not apply any styling to
    # them).
    base_css = '''
    a[href] { text-decoration: underline; color: blue }
    '''

    def __init__(self, oeb, docx, mi, add_cover, add_toc):
        '''
        :param oeb: The OEB book to convert (its spine, manifest and metadata are read)
        :param docx: The DOCX container to fill; also supplies ``log`` and ``opts``
        :param mi: Metadata object; only ``mi.language`` is read by this class
        :param add_cover: If true, the cover image named in the OEB metadata is added
        :param add_toc: If true, the links manager is asked to process the ToC links
        '''
        self.oeb, self.docx, self.add_cover, self.add_toc = oeb, docx, add_cover, add_toc
        self.log, self.opts = docx.log, docx.opts
        self.mi = mi
        self.cover_img = None
        # Record the effective page area on the output profile object
        p = self.opts.output_profile
        p.width_pts, p.height_pts = page_effective_area(self.opts)

    def __call__(self):
        ''' Run the whole conversion and write the result into ``self.docx``. '''
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        self.svg_rasterizer = SVGRasterizer(base_css=self.base_css)
        self.svg_rasterizer(self.oeb, self.opts)

        self.styles_manager = StylesManager(self.docx.namespace, self.log, self.mi.language)
        self.links_manager = LinksManager(self.docx.namespace, self.docx.document_relationships, self.log)
        self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships, self.opts)
        self.lists_manager = ListsManager(self.docx)
        self.fonts_manager = FontsManager(self.docx.namespace, self.oeb, self.opts)
        self.blocks = Blocks(self.docx.namespace, self.styles_manager, self.links_manager)
        # Link and language in effect for the tag currently being processed
        self.current_link = self.current_lang = None

        for item in self.oeb.spine:
            self.log.debug('Processing', item.href)
            self.process_item(item)
        if self.add_toc:
            self.links_manager.process_toc_links(self.oeb)

        # Only read the cover when the id named in the metadata is actually
        # present in the manifest
        if self.add_cover and self.oeb.metadata.cover and unicode(self.oeb.metadata.cover[0]) in self.oeb.manifest.ids:
            cover_id = unicode(self.oeb.metadata.cover[0])
            item = self.oeb.manifest.ids[cover_id]
            self.cover_img = self.images_manager.read_image(item.href)

        # Let every block except the last decide, given its successor, whether
        # it is skipped. Deletion happens afterwards and back to front so that
        # the recorded positions stay valid.
        all_blocks = self.blocks.all_blocks
        remove_blocks = []
        for i, block in enumerate(all_blocks):
            try:
                nb = all_blocks[i+1]
            except IndexError:
                break
            block.resolve_skipped(nb)
            if block.skipped:
                remove_blocks.append((i, block))
        for pos, block in reversed(remove_blocks):
            self.blocks.delete_block_at(pos)
        self.blocks.all_blocks[0].is_first_block = True
        self.blocks.apply_page_break_after()
        self.blocks.resolve_language()

        if self.cover_img is not None:
            self.cover_img = self.images_manager.create_cover_markup(self.cover_img, self.opts.preserve_cover_aspect_ratio, *page_size(self.opts))
        self.lists_manager.finalize(all_blocks)
        self.styles_manager.finalize(all_blocks)
        self.write()

    def process_item(self, item):
        '''
        Convert one spine item by processing every ``<body>`` found in it.

        :param item: The spine item; ``item.data``, ``item.href`` and ``item.abshref`` are used
        '''
        self.current_item = item
        # Re-use the stylizer the rasterizer already built for this item, if any
        stylizer = self.svg_rasterizer.stylizer_cache.get(item)
        if stylizer is None:
            stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, profile=self.opts.output_profile, base_css=self.base_css)
        self.abshref = self.images_manager.abshref = item.abshref

        self.current_lang = lang_for_tag(item.data) or self.styles_manager.document_lang
        for i, body in enumerate(XPath('//h:body')(item.data)):
            with self.blocks:
                self.blocks.top_bookmark = self.links_manager.bookmark_for_anchor(self.links_manager.top_anchor, self.current_item, body)
                self.process_tag(body, stylizer, is_first_tag=i == 0)

    def process_tag(self, html_tag, stylizer, is_first_tag=False, float_spec=None):
        '''
        Recursively convert ``html_tag``, its children and its tail text.

        :param html_tag: The tag to convert
        :param stylizer: Stylizer used to look up the style of tags
        :param is_first_tag: True for the first ``<body>`` of an item. Such a tag
                             is never treated as a float and its tail is ignored.
        :param float_spec: Float specification in effect from an enclosing
                           floating tag, if any; it is handed on to the children
        '''
        tagname = barename(html_tag.tag)
        tag_style = stylizer.style(html_tag)
        # For these tags only the contents are ignored, the tail is still
        # processed below as it belongs to the parent tag
        ignore_tag_contents = tagname in {'script', 'style', 'title', 'meta'} or tag_style.is_hidden
        display = tag_style._get('display')
        is_block = False

        if not ignore_tag_contents:
            # Link and language are changed only for the duration of this tag,
            # they are restored once its children have been processed
            previous_link = self.current_link
            if tagname == 'a' and html_tag.get('href'):
                self.current_link = (self.current_item, html_tag.get('href'), html_tag.get('title'))
            previous_lang = self.current_lang
            tag_lang = lang_for_tag(html_tag)
            if tag_lang:
                self.current_lang = tag_lang

            is_float = tag_style['float'] in {'left', 'right'} and not is_first_tag
            if float_spec is None and is_float:
                float_spec = FloatSpec(self.docx.namespace, html_tag, tag_style)

            if display in {'inline', 'inline-block'} or tagname == 'br':  # <br> has display:block but we don't want to start a new paragraph
                if is_float and float_spec.is_dropcaps:
                    self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)
                    float_spec = None
                else:
                    self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            elif display == 'list-item':
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_list_item=True)
            elif display.startswith('table') or display == 'inline-table':
                if display == 'table-cell':
                    self.blocks.start_new_cell(html_tag, tag_style)
                    self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_table_cell=True)
                elif display == 'table-row':
                    self.blocks.start_new_row(html_tag, tag_style)
                elif display in {'table', 'inline-table'}:
                    self.blocks.end_current_block()
                    self.blocks.start_new_table(html_tag, tag_style)
            else:
                if tagname == 'img' and is_float:
                    # Image is floating so don't start a new paragraph for it
                    self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
                else:
                    if tagname == 'hr':
                        # Only the top border of an <hr> is kept
                        for edge in 'right bottom left'.split():
                            tag_style.set('border-%s-style' % edge, 'none')
                    self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)

            for child in html_tag.iterchildren():
                if isinstance(getattr(child, 'tag', None), basestring):
                    self.process_tag(child, stylizer, float_spec=float_spec)
                else:  # Comment/PI/etc.
                    tail = getattr(child, 'tail', None)
                    if tail:
                        # The tail of the comment is text inside html_tag, so
                        # the block must be based on the parent of the
                        # comment (html_tag), not on the parent of html_tag
                        block = self.create_block_from_parent(child, stylizer)
                        block.add_text(tail, tag_style, is_parent_style=False, link=self.current_link, lang=self.current_lang)

            is_block = html_tag in self.blocks.open_html_blocks
            self.blocks.finish_tag(html_tag)
            if is_block and tag_style['page-break-after'] == 'avoid':
                self.blocks.all_blocks[-1].keep_next = True

            self.current_link = previous_link
            self.current_lang = previous_lang

        # Now, process the tail if any

        if display == 'table-row':
            return  # We ignore the tail for these tags

        ignore_whitespace_tail = is_block or display.startswith('table')
        if not is_first_tag and html_tag.tail and (not ignore_whitespace_tail or not html_tag.tail.isspace()):
            # Ignore trailing space after a block tag, as otherwise it will
            # become a new empty paragraph
            block = self.create_block_from_parent(html_tag, stylizer)
            block.add_text(html_tag.tail, stylizer.style(html_tag.getparent()), is_parent_style=True, link=self.current_link, lang=self.current_lang)

    def create_block_from_parent(self, html_tag, stylizer):
        '''
        Return the current block, or a new block based on the parent of
        ``html_tag`` and its style when there is no current block. In both
        cases ``page_break_before`` of the returned block is set to False.
        '''
        parent = html_tag.getparent()
        block = self.blocks.current_or_new_block(parent, stylizer.style(parent))
        # Do not inherit page-break-before from parent
        block.page_break_before = False
        return block

    def add_block_tag(self, tagname, html_tag, tag_style, stylizer, is_table_cell=False, float_spec=None, is_list_item=False):
        '''
        Start a new block for ``html_tag`` and add its bookmark and its
        leading content (the image for an ``<img>``, otherwise the text of
        the tag) to that block.

        :param is_table_cell: Passed through to ``start_new_block``
        :param float_spec: Passed through to ``start_new_block``
        :param is_list_item: Passed through to ``start_new_block``
        '''
        block = self.blocks.start_new_block(
            html_tag, tag_style, is_table_cell=is_table_cell, float_spec=float_spec, is_list_item=is_list_item)
        anchor = html_tag.get('id') or html_tag.get('name')
        if anchor:
            block.bookmarks.add(self.bookmark_for_anchor(anchor, html_tag))
        if tagname == 'img':
            self.images_manager.add_image(html_tag, block, stylizer, as_block=True)
        else:
            if html_tag.text:
                block.add_text(html_tag.text, tag_style, ignore_leading_whitespace=True, is_parent_style=True, link=self.current_link, lang=self.current_lang)

    def add_inline_tag(self, tagname, html_tag, tag_style, stylizer):
        '''
        Add the content of an inline tag (a break, an image or text) to the
        block obtained from :meth:`create_block_from_parent`. The bookmark
        for the id/name of the tag, if any, is attached to that content.
        '''
        anchor = html_tag.get('id') or html_tag.get('name') or None
        bmark = None
        if anchor:
            bmark = self.bookmark_for_anchor(anchor, html_tag)
        if tagname == 'br':
            # A <br> that is the last element child of its parent and has no
            # text after it does not produce a break
            if html_tag.tail or html_tag is not tuple(html_tag.getparent().iterchildren('*'))[-1]:
                block = self.create_block_from_parent(html_tag, stylizer)
                block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(tag_style['clear'], 'none'), bookmark=bmark)
        elif tagname == 'img':
            block = self.create_block_from_parent(html_tag, stylizer)
            self.images_manager.add_image(html_tag, block, stylizer, bookmark=bmark)
        else:
            if html_tag.text:
                block = self.create_block_from_parent(html_tag, stylizer)
                block.add_text(html_tag.text, tag_style, is_parent_style=False, bookmark=bmark, link=self.current_link, lang=self.current_lang)
            elif bmark:
                # No text, but the tag is an anchor: add empty text so that
                # the bookmark is not lost
                block = self.create_block_from_parent(html_tag, stylizer)
                block.add_text('', tag_style, is_parent_style=False, bookmark=bmark, link=self.current_link, lang=self.current_lang)

    def bookmark_for_anchor(self, anchor, html_tag):
        ''' Return what the links manager gives for ``anchor`` on ``html_tag``
        in the item currently being processed. '''
        return self.links_manager.bookmark_for_anchor(anchor, self.current_item, html_tag)

    def write(self):
        '''
        Create the document skeleton and serialize the blocks, the optional
        ToC and cover, and every manager into the parts of ``self.docx``.
        '''
        self.docx.document, self.docx.styles, body = create_skeleton(self.opts)
        self.blocks.serialize(body)
        body.append(body[0])  # Move <sectPr> to the end
        if self.links_manager.toc:
            self.links_manager.serialize_toc(body, self.styles_manager.primary_heading_style)
        if self.cover_img is not None:
            self.images_manager.write_cover_block(body, self.cover_img)
        self.styles_manager.serialize(self.docx.styles)
        self.images_manager.serialize(self.docx.images)
        self.fonts_manager.serialize(self.styles_manager.text_styles, self.docx.font_table, self.docx.embedded_fonts, self.docx.fonts)
        self.lists_manager.serialize(self.docx.numbering)
Example #8
0
class Convert(object):
    '''
    Convert the HTML content of an OEB book into the parts of a DOCX
    container.

    The work is done by calling the instance (see :meth:`__call__`): every
    spine item is walked tag by tag, the tags are turned into blocks held by
    ``self.blocks`` and finally everything is serialized by :meth:`write`.
    '''

    # Word does not apply default styling to hyperlinks, so we ensure they get
    # default styling (the conversion pipeline does not apply any styling to
    # them).
    base_css = '''
    a[href] { text-decoration: underline; color: blue }
    '''

    def __init__(self, oeb, docx):
        '''
        :param oeb: The OEB book to convert
        :param docx: The DOCX container to fill; also supplies ``log`` and ``opts``
        '''
        self.oeb, self.docx = oeb, docx
        self.log, self.opts = docx.log, docx.opts

    def __call__(self):
        ''' Run the whole conversion and write the result into ``self.docx``. '''
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        self.svg_rasterizer = SVGRasterizer(base_css=self.base_css)
        self.svg_rasterizer(self.oeb, self.opts)

        self.styles_manager = StylesManager(self.docx.namespace)
        self.links_manager = LinksManager(self.docx.namespace,
                                          self.docx.document_relationships)
        self.images_manager = ImagesManager(self.oeb,
                                            self.docx.document_relationships)
        self.lists_manager = ListsManager(self.docx)
        self.fonts_manager = FontsManager(self.docx.namespace, self.oeb,
                                          self.opts)
        self.blocks = Blocks(self.docx.namespace, self.styles_manager,
                             self.links_manager)
        # Link in effect for the tag currently being processed
        self.current_link = None

        for item in self.oeb.spine:
            self.process_item(item)

        # Let every block except the last decide, given its successor, whether
        # it is skipped. Deletion happens afterwards and back to front so that
        # the recorded positions stay valid.
        all_blocks = self.blocks.all_blocks
        remove_blocks = []
        for i, block in enumerate(all_blocks):
            try:
                nb = all_blocks[i + 1]
            except IndexError:
                break
            block.resolve_skipped(nb)
            if block.skipped:
                remove_blocks.append((i, block))
        for pos, block in reversed(remove_blocks):
            self.blocks.delete_block_at(pos)
        self.blocks.all_blocks[0].is_first_block = True

        self.lists_manager.finalize(all_blocks)
        self.styles_manager.finalize(all_blocks)
        self.write()

    def process_item(self, item):
        '''
        Convert one spine item by processing every ``<body>`` found in it.

        :param item: The spine item; ``item.data``, ``item.href`` and ``item.abshref`` are used
        '''
        self.current_item = item
        # Re-use the stylizer the rasterizer already built for this item, if any
        stylizer = self.svg_rasterizer.stylizer_cache.get(item)
        if stylizer is None:
            stylizer = Stylizer(item.data,
                                item.href,
                                self.oeb,
                                self.opts,
                                self.opts.output_profile,
                                base_css=self.base_css)
        self.abshref = self.images_manager.abshref = item.abshref

        for i, body in enumerate(XPath('//h:body')(item.data)):
            with self.blocks:
                # Make sure the body has an id, so that a bookmark is created
                # for it when its block is added
                body.set('id',
                         body.get('id', None) or self.links_manager.top_anchor)
                self.process_tag(body, stylizer, is_first_tag=i == 0)

    def process_tag(self,
                    html_tag,
                    stylizer,
                    is_first_tag=False,
                    float_spec=None):
        '''
        Recursively convert ``html_tag``, its children and its tail text.

        The contents of script, style, title, meta and hidden tags are
        skipped, but their tail text is still converted, as the tail is part
        of the parent tag. Text that follows a comment or a processing
        instruction is converted as well.

        :param html_tag: The tag to convert
        :param stylizer: Stylizer used to look up the style of tags
        :param is_first_tag: True for the first ``<body>`` of an item. Such a tag
                             is never treated as a float and its tail is ignored.
        :param float_spec: Float specification in effect from an enclosing
                           floating tag, if any; it is handed on to the children
        '''
        tagname = barename(html_tag.tag)
        tag_style = stylizer.style(html_tag)
        # Only the contents of these tags are skipped. Returning early here
        # would also drop the tail text, which is visible text of the parent.
        skip_contents = tagname in {'script', 'style', 'title', 'meta'} or tag_style.is_hidden
        display = tag_style._get('display')
        is_block = False

        if not skip_contents:
            # The link is changed only for the duration of this tag, it is
            # restored once the children have been processed
            previous_link = self.current_link
            if tagname == 'a' and html_tag.get('href'):
                self.current_link = (self.current_item, html_tag.get('href'),
                                     html_tag.get('title'))

            is_float = tag_style['float'] in {'left', 'right'} and not is_first_tag
            if float_spec is None and is_float:
                float_spec = FloatSpec(self.docx.namespace, html_tag, tag_style)

            if display in {'inline', 'inline-block'} or tagname == 'br':  # <br> has display:block but we don't want to start a new paragraph
                if is_float and float_spec.is_dropcaps:
                    self.add_block_tag(tagname, html_tag, tag_style, stylizer,
                                       float_spec=float_spec)
                    float_spec = None
                else:
                    self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            elif display == 'list-item':
                self.add_block_tag(tagname, html_tag, tag_style, stylizer,
                                   is_list_item=True)
            elif display.startswith('table') or display == 'inline-table':
                if display == 'table-cell':
                    self.blocks.start_new_cell(html_tag, tag_style)
                    self.add_block_tag(tagname, html_tag, tag_style, stylizer,
                                       is_table_cell=True)
                elif display == 'table-row':
                    self.blocks.start_new_row(html_tag, tag_style)
                elif display in {'table', 'inline-table'}:
                    self.blocks.end_current_block()
                    self.blocks.start_new_table(html_tag, tag_style)
            else:
                if tagname == 'img' and is_float:
                    # Image is floating so don't start a new paragraph for it
                    self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
                else:
                    self.add_block_tag(tagname, html_tag, tag_style, stylizer,
                                       float_spec=float_spec)

            # Iterate over all child nodes, not only elements, so that text
            # following a comment or processing instruction is not lost
            for child in html_tag.iterchildren():
                if callable(child.tag):
                    # Comment/PI/etc.: the tag of such a node is a factory
                    # function instead of a name. The node itself has no
                    # visible content, but its tail is text inside html_tag
                    # and is added the same way as the tail of a child tag.
                    if child.tail:
                        block = self.blocks.current_or_new_block(html_tag, tag_style)
                        block.add_text(child.tail,
                                       tag_style,
                                       is_parent_style=True,
                                       link=self.current_link)
                else:
                    self.process_tag(child, stylizer, float_spec=float_spec)

            is_block = html_tag in self.blocks.open_html_blocks
            self.blocks.finish_tag(html_tag)
            if is_block and tag_style['page-break-after'] == 'avoid':
                self.blocks.all_blocks[-1].keep_next = True

            self.current_link = previous_link

        # Now, process the tail if any

        if display == 'table-row':
            return  # We ignore the tail for these tags

        ignore_whitespace_tail = is_block or display.startswith('table')
        if not is_first_tag and html_tag.tail and (
                not ignore_whitespace_tail or not html_tag.tail.isspace()):
            # Ignore trailing space after a block tag, as otherwise it will
            # become a new empty paragraph
            parent = html_tag.getparent()
            parent_style = stylizer.style(parent)
            block = self.blocks.current_or_new_block(parent, parent_style)
            block.add_text(html_tag.tail,
                           parent_style,
                           is_parent_style=True,
                           link=self.current_link)

    def add_block_tag(self,
                      tagname,
                      html_tag,
                      tag_style,
                      stylizer,
                      is_table_cell=False,
                      float_spec=None,
                      is_list_item=False):
        '''
        Start a new block for ``html_tag`` and add its bookmark and its
        leading content (the image for an ``<img>``, otherwise the text of
        the tag) to that block.

        :param is_table_cell: Passed through to ``start_new_block``
        :param float_spec: Passed through to ``start_new_block``
        :param is_list_item: Passed through to ``start_new_block``
        '''
        block = self.blocks.start_new_block(html_tag,
                                            tag_style,
                                            is_table_cell=is_table_cell,
                                            float_spec=float_spec,
                                            is_list_item=is_list_item)
        anchor = html_tag.get('id') or html_tag.get('name')
        if anchor:
            block.bookmarks.add(self.bookmark_for_anchor(anchor, html_tag))
        if tagname == 'img':
            self.images_manager.add_image(html_tag,
                                          block,
                                          stylizer,
                                          as_block=True)
        else:
            if html_tag.text:
                block.add_text(html_tag.text,
                               tag_style,
                               ignore_leading_whitespace=True,
                               is_parent_style=True,
                               link=self.current_link)

    def add_inline_tag(self, tagname, html_tag, tag_style, stylizer):
        '''
        Add the content of an inline tag (a break, an image or text) to the
        current block, or to a new block based on the parent of ``html_tag``
        when no block is current. The bookmark for the id/name of the tag,
        if any, is attached to that content.
        '''
        anchor = html_tag.get('id') or html_tag.get('name') or None
        bmark = None
        if anchor:
            bmark = self.bookmark_for_anchor(anchor, html_tag)
        if tagname == 'br':
            # A <br> that is the last element child of its parent and has no
            # text after it does not produce a break
            if html_tag.tail or html_tag is not tuple(
                    html_tag.getparent().iterchildren('*'))[-1]:
                block = self.blocks.current_or_new_block(
                    html_tag.getparent(), stylizer.style(html_tag.getparent()))
                block.add_break(clear={
                    'both': 'all',
                    'left': 'left',
                    'right': 'right'
                }.get(tag_style['clear'], 'none'),
                                bookmark=bmark)
        elif tagname == 'img':
            block = self.blocks.current_or_new_block(
                html_tag.getparent(), stylizer.style(html_tag.getparent()))
            self.images_manager.add_image(html_tag,
                                          block,
                                          stylizer,
                                          bookmark=bmark)
        else:
            if html_tag.text:
                block = self.blocks.current_or_new_block(
                    html_tag.getparent(), stylizer.style(html_tag.getparent()))
                block.add_text(html_tag.text,
                               tag_style,
                               is_parent_style=False,
                               bookmark=bmark,
                               link=self.current_link)

    def bookmark_for_anchor(self, anchor, html_tag):
        ''' Return what the links manager gives for ``anchor`` on ``html_tag``
        in the item currently being processed. '''
        return self.links_manager.bookmark_for_anchor(anchor,
                                                      self.current_item,
                                                      html_tag)

    def write(self):
        '''
        Create the document skeleton and serialize the blocks and every
        manager into the corresponding parts of ``self.docx``.
        '''
        self.docx.document, self.docx.styles, body = create_skeleton(self.opts)
        self.blocks.serialize(body)
        body.append(body[0])  # Move <sectPr> to the end
        self.styles_manager.serialize(self.docx.styles)
        self.images_manager.serialize(self.docx.images)
        self.fonts_manager.serialize(self.styles_manager.text_styles,
                                     self.docx.font_table,
                                     self.docx.embedded_fonts, self.docx.fonts)
        self.lists_manager.serialize(self.docx.numbering)