Example #1
0
    def process_tag(self,
                    html_tag,
                    stylizer,
                    is_first_tag=False,
                    float_spec=None):
        tagname = barename(html_tag.tag)
        tag_style = stylizer.style(html_tag)
        ignore_tag_contents = tagname in {'script', 'style', 'title', 'meta'
                                          } or tag_style.is_hidden
        display = tag_style._get('display')
        is_block = False

        if not ignore_tag_contents:
            previous_link = self.current_link
            if tagname == 'a' and html_tag.get('href'):
                self.current_link = (self.current_item, html_tag.get('href'),
                                     html_tag.get('title'))
            previous_lang = self.current_lang
            tag_lang = lang_for_tag(html_tag)
            if tag_lang:
                self.current_lang = tag_lang

            is_float = tag_style['float'] in {'left', 'right'
                                              } and not is_first_tag
            if float_spec is None and is_float:
                float_spec = FloatSpec(self.docx.namespace, html_tag,
                                       tag_style)

            if display in {
                    'inline', 'inline-block'
            } or tagname == 'br':  # <br> has display:block but we dont want to start a new paragraph
                if is_float and float_spec.is_dropcaps:
                    self.add_block_tag(tagname,
                                       html_tag,
                                       tag_style,
                                       stylizer,
                                       float_spec=float_spec)
                    float_spec = None
                else:
                    self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            elif display == 'list-item':
                self.add_block_tag(tagname,
                                   html_tag,
                                   tag_style,
                                   stylizer,
                                   is_list_item=True)
            elif display.startswith('table') or display == 'inline-table':
                if display == 'table-cell':
                    self.blocks.start_new_cell(html_tag, tag_style)
                    self.add_block_tag(tagname,
                                       html_tag,
                                       tag_style,
                                       stylizer,
                                       is_table_cell=True)
                elif display == 'table-row':
                    self.blocks.start_new_row(html_tag, tag_style)
                elif display in {'table', 'inline-table'}:
                    self.blocks.end_current_block()
                    self.blocks.start_new_table(html_tag, tag_style)
            else:
                if tagname == 'img' and is_float:
                    # Image is floating so dont start a new paragraph for it
                    self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
                else:
                    if tagname == 'hr':
                        for edge in 'right bottom left'.split():
                            tag_style.set('border-%s-style' % edge, 'none')
                    self.add_block_tag(tagname,
                                       html_tag,
                                       tag_style,
                                       stylizer,
                                       float_spec=float_spec)

            for child in html_tag.iterchildren():
                if isinstance(getattr(child, 'tag', None), string_or_bytes):
                    self.process_tag(child, stylizer, float_spec=float_spec)
                else:  # Comment/PI/etc.
                    tail = getattr(child, 'tail', None)
                    if tail:
                        block = self.create_block_from_parent(
                            html_tag, stylizer)
                        block.add_text(tail,
                                       tag_style,
                                       is_parent_style=False,
                                       link=self.current_link,
                                       lang=self.current_lang)

            is_block = html_tag in self.blocks.open_html_blocks
            self.blocks.finish_tag(html_tag)
            if is_block and tag_style['page-break-after'] == 'avoid':
                self.blocks.all_blocks[-1].keep_next = True

            self.current_link = previous_link
            self.current_lang = previous_lang

        # Now, process the tail if any

        if display == 'table-row':
            return  # We ignore the tail for these tags

        ignore_whitespace_tail = is_block or display.startswith('table')
        if not is_first_tag and html_tag.tail and (
                not ignore_whitespace_tail or not html_tag.tail.isspace()):
            # Ignore trailing space after a block tag, as otherwise it will
            # become a new empty paragraph
            block = self.create_block_from_parent(html_tag, stylizer)
            block.add_text(html_tag.tail,
                           stylizer.style(html_tag.getparent()),
                           is_parent_style=True,
                           link=self.current_link,
                           lang=self.current_lang)
Example #2
0
    def process_tag(self,
                    html_tag,
                    stylizer,
                    is_first_tag=False,
                    float_spec=None):
        tagname = barename(html_tag.tag)
        if tagname in {'script', 'style', 'title', 'meta'}:
            return
        tag_style = stylizer.style(html_tag)
        if tag_style.is_hidden:
            return

        previous_link = self.current_link
        if tagname == 'a' and html_tag.get('href'):
            self.current_link = (self.current_item, html_tag.get('href'),
                                 html_tag.get('title'))

        display = tag_style._get('display')
        is_float = tag_style['float'] in {'left', 'right'} and not is_first_tag
        if float_spec is None and is_float:
            float_spec = FloatSpec(self.docx.namespace, html_tag, tag_style)

        if display in {
                'inline', 'inline-block'
        } or tagname == 'br':  # <br> has display:block but we dont want to start a new paragraph
            if is_float and float_spec.is_dropcaps:
                self.add_block_tag(tagname,
                                   html_tag,
                                   tag_style,
                                   stylizer,
                                   float_spec=float_spec)
                float_spec = None
            else:
                self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
        elif display == 'list-item':
            self.add_block_tag(tagname,
                               html_tag,
                               tag_style,
                               stylizer,
                               is_list_item=True)
        elif display.startswith('table') or display == 'inline-table':
            if display == 'table-cell':
                self.blocks.start_new_cell(html_tag, tag_style)
                self.add_block_tag(tagname,
                                   html_tag,
                                   tag_style,
                                   stylizer,
                                   is_table_cell=True)
            elif display == 'table-row':
                self.blocks.start_new_row(html_tag, tag_style)
            elif display in {'table', 'inline-table'}:
                self.blocks.end_current_block()
                self.blocks.start_new_table(html_tag, tag_style)
        else:
            if tagname == 'img' and is_float:
                # Image is floating so dont start a new paragraph for it
                self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            else:
                self.add_block_tag(tagname,
                                   html_tag,
                                   tag_style,
                                   stylizer,
                                   float_spec=float_spec)

        for child in html_tag.iterchildren('*'):
            self.process_tag(child, stylizer, float_spec=float_spec)

        is_block = html_tag in self.blocks.open_html_blocks
        self.blocks.finish_tag(html_tag)
        if is_block and tag_style['page-break-after'] == 'avoid':
            self.blocks.all_blocks[-1].keep_next = True

        self.current_link = previous_link

        if display == 'table-row':
            return  # We ignore the tail for these tags

        ignore_whitespace_tail = is_block or display.startswith('table')
        if not is_first_tag and html_tag.tail and (
                not ignore_whitespace_tail or not html_tag.tail.isspace()):
            # Ignore trailing space after a block tag, as otherwise it will
            # become a new empty paragraph
            block = self.blocks.current_or_new_block(
                html_tag.getparent(), stylizer.style(html_tag.getparent()))
            block.add_text(html_tag.tail,
                           stylizer.style(html_tag.getparent()),
                           is_parent_style=True,
                           link=self.current_link)