def replace_asset_hrefs(soup: PageElement, base_url: str) -> PageElement:
    """Rewrite every relative asset reference in the document to an absolute URL.

    Covers `<link href=...>` elements (stylesheets, icons) and any element
    carrying a `src` attribute (images, scripts, iframes).
    """
    for tag in soup.find_all('link', href=True):
        tag['href'] = abs_asset_href(tag['href'], base_url)
    for tag in soup.find_all(src=True):
        tag['src'] = abs_asset_href(tag['src'], base_url)
    return soup
def convert_for_two_columns(soup: PageElement, level: int, logger: Logger = None):
    """Wrap each `<h3>` heading and its following content in a two-column section.

    Arguments:
        soup -- DOM of the document to transform (mutated in place).
        level -- heading level to convert; only 0 (disabled) and 3 are supported.
        logger -- optional logger for progress/warning messages.
    """
    if level == 0:
        return
    elif level != 3:
        if logger:
            logger.warning('`two_columns_level` is only support `3` yet.')
        return

    if logger:
        logger.info('Converting for two-column layout(heading level 3).')

    # Track already-wrapped <h3> tags by identity. bs4's Tag.__eq__ compares
    # rendered markup, so a plain `el in ignored` membership test would also
    # match a *different* heading that happens to have identical content and
    # wrongly skip it.
    handled = set()
    for el in soup.find_all('h3'):
        if id(el) in handled:
            continue
        # Everything following this heading up to (not including) the next
        # h1/h2 belongs to its section.
        siblings = list(itertools.takewhile(
            lambda x: x.name not in ['h1', 'h2'], el.next_siblings))
        section = soup.new_tag('section', **{'class': 'md-typeset two-columns'})
        el.wrap(section)
        for tag in siblings:
            section.append(tag)
            if tag.name == 'h3':
                # This h3 is now inside the section; don't wrap it again.
                handled.add(id(tag))
        images_size_to_half_in(section)
def get_combined(soup: PageElement, base_url: str, rel_url: str) -> PageElement:
    """Prepare a page for the combined PDF: namespace ids and rewrite links.

    Relative hrefs that point at other HTML documents become in-document
    PDF anchors, and all asset references are made absolute.
    """
    # Prefix every id with the page's relative URL so ids stay unique once
    # all pages are concatenated into a single document.
    for el in soup.find_all(id=True):
        el['id'] = transform_id(el['id'], rel_url)

    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        # Absolute URLs and absolute filesystem paths are left untouched.
        if urls.url_is_absolute(href) or os.path.isabs(href):
            continue
        anchor['href'] = transform_href(href, rel_url)

    soup.body['id'] = get_body_id(rel_url)
    return replace_asset_hrefs(soup, base_url)
def convert_iframe(soup: PageElement, entries: list, logger: Logger = None):
    """Replace each configured `<iframe>` with an anchor to its source.

    e.g:

    ```html
    "before:"
    <iframe frameborder="0" height="100%" src="SRC"/>
    ```

    ```html
    "after:"
    <a class="converted-iframe" href="SRC" target="_blank">
      <img src="POSTER IMAGE"/>
    </a>
    ```
    """
    if not entries:
        return
    if logger:
        logger.info('Converting <iframe> to poster image(if available).')

    for iframe in soup.find_all('iframe', src=True):
        for entry in entries:
            # Only entries whose configured src matches this iframe apply.
            if entry.get('src') != iframe['src']:
                continue

            anchor = soup.new_tag('a', href=iframe['src'], target='_blank',
                                  **{'class': 'converted-iframe'})
            poster = entry.get('img')
            if poster:
                anchor.append(soup.new_tag('img', src=poster))
            caption = entry.get('text')
            if caption:
                label = soup.new_tag('span')
                label.string = caption
                anchor.append(label)
            # Carry over selected presentation attributes from the iframe.
            for key, val in iframe.attrs.items():
                if key in ['style']:
                    anchor[key] = val
            iframe.replace_with(anchor)
def _remove_empty_tags(self, soup: PageElement):
    """Repeatedly strip blank `<article>` and `<p>` elements from the document.

    An element counts as blank when it has no visible text and contains no
    `<img>`/`<svg>` descendant. The scan loops until a pass removes nothing,
    since extracting a child can make its parent blank in turn.
    """
    def _is_blank(el):
        if len(el.get_text(strip=True)) != 0:
            return False  # has visible text
        if el.find(['img', 'svg']):
            return False  # carries an image
        return True

    targets = ['article', 'p']
    removed = True
    while removed:
        removed = False
        for el in soup.find_all():
            if el.name in targets and _is_blank(el):
                # self.logger.debug(f'Strip: {el}')
                el.extract()
                removed = True
def get_html_table_header_and_rows(
        table: bs4.PageElement) -> Tuple[List, List]:
    """Split an HTML table into header-cell texts and body-row cells.

    Arguments:
        table -- a bs4 `<table>` element whose first `<tr>` is the header row.

    Returns:
        (header, rows): `header` is a list of header-cell strings, `rows` is
        a list of lists of bs4 cell elements (`<th>`/`<td>`).
    """
    header: List = []
    rows: List = []

    all_rows = table.find_all('tr')
    # A table without any <tr> would previously crash (iterating None).
    if not all_rows:
        return header, rows

    # Select real cells only: iterating the <tr> directly also yields the
    # whitespace text nodes between cells, polluting the header list.
    for cell in all_rows[0].find_all(['th', 'td']):
        header.append(cell.get_text())

    for table_row in all_rows[1:]:
        rows.append(list(table_row.find_all(['th', 'td'])))

    return header, rows
def make_indexes(soup: PageElement, options: Options) -> None: """ Generate ordered chapter number and TOC of document. Arguments: soup {BeautifulSoup} -- DOM object of Document. options {Options} -- The options of this sequence. """ # Step 1: (re)ordered headdings _inject_heading_order(soup, options) # Step 2: generate toc page level = options.toc_level if level < 1 or level > 3: return options.logger.info( f'Generate a table of contents up to heading level {level}.') h1li = None h2ul = h2li = h3ul = None exclude_lv2 = exclude_lv3 = False def makeLink(h: Tag) -> Tag: li = soup.new_tag('li') ref = h.get('id', '') a = soup.new_tag('a', href=f'#{ref}') for el in h.contents: if el.name == 'a': a.append(el.contents[0]) else: a.append(clone_element(el)) li.append(a) options.logger.debug(f"| [{h.get_text(separator=' ')}]({ref})") return li toc = soup.new_tag('article', id='doc-toc') title = soup.new_tag('h1') title.append(options.toc_title) toc.append(title) h1ul = soup.new_tag('ul') toc.append(h1ul) headings = soup.find_all(['h1', 'h2', 'h3']) for h in headings: if h.name == 'h1': h1li = makeLink(h) h1ul.append(h1li) h2ul = h2li = h3ul = None exclude_lv2 = _is_exclude(h.get('id', None), options) elif not exclude_lv2 and h.name == 'h2' and level >= 2: if not h2ul: h2ul = soup.new_tag('ul') h1li.append(h2ul) h2li = makeLink(h) h2ul.append(h2li) h3ul = None exclude_lv3 = _is_exclude(h.get('id', None), options) elif not exclude_lv2 and not exclude_lv3 \ and h.name == 'h3' and level >= 3: if not h2li: continue if not h3ul: h3ul = soup.new_tag('ul') h2li.append(h3ul) h3li = makeLink(h) h3ul.append(h3li) else: continue pass soup.body.insert(0, toc)
def get_separate(soup: PageElement, base_url: str) -> PageElement:
    """Prepare a standalone page: point links at PDFs and absolutize assets."""
    for anchor in soup.find_all('a', href=True):
        anchor['href'] = rel_pdf_href(anchor['href'])
    return replace_asset_hrefs(soup, base_url)