def __init__(self, html_tree, display_images=False, deduplicate_captions=False, display_links=False):
    '''
    Prepares the tag dispatch tables and the rendering state, then
    immediately processes ``html_tree`` via ``crawl_tree``.

    ::param: html_tree
        the HTML tree that is handed to ``crawl_tree``
    ::param: display_images
        whether to include image titles/alt texts
    ::param: deduplicate_captions
        whether to deduplicate captions such as image titles
        (many newspapers include images and video previews with
        identical titles).
    ::param: display_links
        whether to display link targets
        (e.g. `[Python](https://www.python.org)`)
    '''
    # configuration
    self.cfg_deduplicate_captions = deduplicate_captions

    # optional handlers: the key is always registered, but maps to None
    # whenever the corresponding feature has been disabled
    link_start_handler = self.start_a if display_links else None
    link_end_handler = self.end_a if display_links else None
    image_start_handler = self.start_img if display_images else None

    # call table for opening tags
    self.start_tag_handler_dict = {
        'table': self.start_table,
        'tr': self.start_tr,
        'td': self.start_td,
        'th': self.start_td,
        'ul': self.start_ul,
        'ol': self.start_ol,
        'li': self.start_li,
        'br': self.newline,
        'a': link_start_handler,
        'img': image_start_handler,
    }

    # call table for closing tags
    self.end_tag_handler_dict = {
        'table': self.end_table,
        'ul': self.end_ul,
        'ol': self.end_ol,
        'td': self.end_td,
        'th': self.end_td,
        'a': link_end_handler,
    }

    # stacks describing the element and the lines currently processed
    self.current_tag = [HtmlElement()]
    self.current_line = [Line()]
    self.next_line = [Line()]

    # the canvases used for displaying text
    # clean_text_lines[0] refers to the root canvas; tables write into
    # child canvases that are created for every table line and merged
    # with the root canvas at the end of a table
    self.clean_text_lines = [[]]

    # table and list bookkeeping
    self.current_table = []
    self.li_counter = []
    self.li_level = 0

    # a list of attributes that are considered invisible
    self.invisible = []
    self.last_caption = None

    # used if display_links is enabled
    self.link_target = ''

    # crawl the html tree and flush whatever remains in the current line
    self.crawl_tree(html_tree)
    if self.current_line[-1]:
        self.write_line()
def start_td(self, attrs):
    '''
    Opens a new table cell in the innermost table. Does nothing if no
    table is currently open.

    ::param: attrs
        the attributes of the tag (not used by this handler)
    '''
    # cells outside of a table are ignored
    if not self.current_table:
        return

    # a preceding cell that has never been closed is closed first
    if self.current_table[-1].td_is_open:
        self.end_td()

    # every cell gets its own canvas and its own current/next line
    cell_canvas = []
    self.clean_text_lines.append(cell_canvas)
    self.current_line.append(Line())
    self.next_line.append(Line())

    # register the canvas with the table and mark the cell as open
    table = self.current_table[-1]
    table.add_cell(cell_canvas)
    table.td_is_open = True
def write_line(self, force=False):
    '''
    Appends the text of the current line to the active canvas and
    promotes the prepared next line to be the new current line.

    Lines without content (or with whitespace only) are skipped unless
    ``force`` is set; in that case only the line's ``margin_before`` is
    raised to the ``margin_before`` of the current tag, if that is larger.

    ::param: force
        write the line even if it does not contain any relevant content
    ::returns:
        True, if a line has been written, otherwise False
    '''
    pending = self.current_line[-1]

    # only break the line if there is any relevant content
    if not force:
        text = pending.content
        if not text or text.isspace():
            pending.margin_before = max(pending.margin_before,
                                        self.current_tag[-1].margin_before)
            return False

    # emit the line and advance to the prepared successor
    self.clean_text_lines[-1].append(pending.get_text())
    self.current_line[-1] = self.next_line[-1]
    self.next_line[-1] = Line()
    return True