Esempio n. 1
0
    def __init__(self,
                 html_tree,
                 display_images=False,
                 deduplicate_captions=False,
                 display_links=False,
                 css=None):
        '''
        Prepare the tag handler tables and the parser state, then process
        ``html_tree``.

        Arguments:
          html_tree -- the tree that is handed over to ``crawl_tree``.
          display_images -- if true, ``img`` start tags are mapped to
                            ``start_img``; otherwise the entry is ``None``.
          deduplicate_captions -- stored as ``cfg_deduplicate_captions``.
          display_links -- if true, ``a`` start/end tags are mapped to
                           ``start_a``/``end_a``; otherwise both entries
                           are ``None``.
          css -- the css profile to use; any falsy value selects
                 ``CSS_PROFILES[DEFAULT_CSS_PROFILE_NAME]``.
        '''
        # configuration
        self.cfg_deduplicate_captions = deduplicate_captions
        self.css = css or CSS_PROFILES[DEFAULT_CSS_PROFILE_NAME]

        # call table for opening tags; the optional handlers are added last
        # and stay None if the corresponding feature has been disabled
        opening_handlers = {
            'table': self.start_table,
            'tr': self.start_tr,
            'td': self.start_td,
            'th': self.start_td,
            'ul': self.start_ul,
            'ol': self.start_ol,
            'li': self.start_li,
            'br': self.newline,
        }
        opening_handlers['a'] = self.start_a if display_links else None
        opening_handlers['img'] = self.start_img if display_images else None
        self.start_tag_handler_dict = opening_handlers

        # call table for closing tags
        closing_handlers = {
            'table': self.end_table,
            'ul': self.end_ul,
            'ol': self.end_ol,
            'td': self.end_td,
            'th': self.end_td,
        }
        closing_handlers['a'] = self.end_a if display_links else None
        self.end_tag_handler_dict = closing_handlers

        # stacks describing the current position within the document
        self.current_tag = [HtmlElement()]
        self.current_line, self.next_line = [Line()], [Line()]

        # Canvases receiving the rendered text: clean_text_lines[0] is the
        # root canvas. Tables write into child canvases that are created
        # for every table line and merged with the root canvas at the end
        # of a table.
        self.clean_text_lines = [[]]

        # table, list and caption bookkeeping
        self.current_table = []
        self.li_counter = []
        self.li_level = 0
        self.invisible = []  # attributes that are considered invisible
        self.last_caption = None

        # only relevant if display_links is enabled
        self.link_target = ''

        # process the tree and flush a line that is still pending afterwards
        self.crawl_tree(html_tree)
        pending_line = self.current_line[-1]
        if pending_line:
            self.write_line()
Esempio n. 2
0
    def start_td(self, attrs):
        '''
        Opens a new table cell within the innermost table.

        Nothing happens if no table is currently open. A cell of that table
        that is still marked as open is closed via ``end_td`` first.

        Arguments:
          attrs -- the tag's attributes (not used by this handler).
        '''
        if not self.current_table:
            return

        table = self.current_table[-1]
        # cleanup a <td> tag that has not been closed yet
        if table.td_is_open:
            self.end_td()

        # the new cell obtains its own canvas and its own line buffers
        cell_canvas = []
        self.clean_text_lines.append(cell_canvas)
        self.current_line.append(Line())
        self.next_line.append(Line())
        table.add_cell(cell_canvas)
        table.td_is_open = True
    def __init__(self, html_tree, config=None):
        '''
        Prepare the tag handler tables and the parser state, then process
        ``html_tree``.

        Arguments:
          html_tree -- the tree that is handed over to ``_parse_html_tree``.
          config -- the configuration object to use; a falsy value selects
                    a freshly created ``ParserConfig()``.
        '''
        # fall back to the default configuration if none has been provided
        self.config = config if config else ParserConfig()

        # call table for opening tags; the optional handlers are added last
        # and stay None if the configuration disables them
        opening_handlers = {
            'table': self._start_table,
            'tr': self._start_tr,
            'td': self._start_td,
            'th': self._start_td,
            'ul': self._start_ul,
            'ol': self._start_ol,
            'li': self._start_li,
            'br': self._newline,
        }
        opening_handlers['a'] = \
            self._start_a if self.config.parse_a() else None
        opening_handlers['img'] = \
            self._start_img if self.config.display_images else None
        self.start_tag_handler_dict = opening_handlers

        # call table for closing tags
        closing_handlers = {
            'table': self._end_table,
            'ul': self._end_ul,
            'ol': self._end_ol,
            'td': self._end_td,
            'th': self._end_td,
        }
        closing_handlers['a'] = \
            self._end_a if self.config.parse_a() else None
        self.end_tag_handler_dict = closing_handlers

        # stacks describing the current position within the document; the
        # tag stack starts with the 'body' entry of the configured css
        self.current_tag = [self.config.css['body']]
        self.current_line, self.next_line = [Line()], [Line()]

        # Canvases receiving the rendered text: clean_text_lines[0] is the
        # root canvas. Tables write into child canvases that are created
        # for every table line and merged with the root canvas at the end
        # of a table.
        self.clean_text_lines = [[]]

        # table, list and caption bookkeeping
        self.current_table = []
        self.li_counter = []
        self.li_level = 0
        self.last_caption = None

        # only relevant if display_links is enabled
        self.link_target = ''

        # process the tree and flush a line that is still pending afterwards
        self._parse_html_tree(html_tree)
        pending_line = self.current_line[-1]
        if pending_line:
            self._write_line()
Esempio n. 4
0
    def write_line(self, force=False):
        '''
        Moves the current line to the active canvas, provided that it
        contains relevant content.

        A line without content (or with whitespace only) is not written
        unless ``force`` is set. In that case only its ``margin_before`` is
        raised to the ``margin_before`` of the current tag, if that value
        is larger.

        Arguments:
          force -- write the line even if it has no relevant content.

        Returns:
          bool -- True, if a line has been written, otherwise False.
        '''
        pending = self.current_line[-1]

        if not force:
            text = pending.content
            if not text or text.isspace():
                # nothing worth writing: just carry over the larger margin
                pending.margin_before = max(
                    pending.margin_before,
                    self.current_tag[-1].margin_before)
                return False

        # emit the line and promote the prepared next line
        self.clean_text_lines[-1].append(pending.get_text())
        self.current_line[-1] = self.next_line[-1]
        self.next_line[-1] = Line()
        return True
Esempio n. 5
0
def test_cell_formatting():
    '''
    Checks the text returned by ``Line.get_text`` (and ``str(line)``) while
    margins, a list bullet, a padding and a prefix/suffix are added one
    after another.
    '''
    line = Line()

    # start with a plain line: no margins, no decorations, no padding
    plain_state = (
        ('margin_before', 0),
        ('margin_after', 0),
        ('prefix', ''),
        ('suffix', ''),
        ('content', 'Ehre sei Gott!'),
        ('list_bullet', ''),
        ('padding', 0),
    )
    for attribute, value in plain_state:
        setattr(line, attribute, value)

    assert line.get_text() == 'Ehre sei Gott!'
    # the string representation
    assert str(line) == "<Line: 'Ehre sei Gott!'>"

    # margins: one line break before and two after the content
    line.margin_before, line.margin_after = 1, 2
    assert line.get_text() == '\nEhre sei Gott!\n\n'

    # a list bullet, still without any padding
    line.list_bullet = "* "
    assert line.get_text() == '\n* Ehre sei Gott!\n\n'

    # a padding of three
    line.padding = 3
    assert line.get_text() == '\n * Ehre sei Gott!\n\n'

    # finally a prefix and a suffix around the content
    line.prefix, line.suffix = '>>', '<<'
    assert line.get_text() == '\n * >>Ehre sei Gott!<<\n\n'