Esempio n. 1
0
    def replace_any_case(element: NavigableString, target_word: str) -> None:
        """Wrap every case-insensitive match of ``target_word`` in ``<b>``.

        The text of ``element`` is HTML-escaped, each whole-word occurrence
        of ``target_word`` is wrapped in ``<b>...</b>`` (the casing found in
        the text is kept), and ``element`` is replaced in its tree by the
        re-parsed markup.

        Nothing is done when ``element`` has the same length as
        ``target_word``, when ``target_word`` contains no ASCII letter or
        digit, or when ``element`` sits directly inside a ``<style>`` tag.

        :param element: text node to rewrite in place.
        :param target_word: word to embolden; it is matched as literal text,
            never interpreted as a regular expression.
        """
        if len(element) == len(target_word):
            return

        if not re.match('.*[a-zA-Z0-9].*', target_word) or (
                element.parent and element.parent.name == 'style'):
            return

        # Escape the word so characters such as "+", "(" or "[" are matched
        # literally instead of being compiled as regex syntax (which could
        # raise re.error or match unintended text).
        literal = re.escape(target_word)
        # The lookaheads reject matches that begin with, or are followed by,
        # one of the characters "{", "}", "<", ">" or "-".
        pattern = fr'\b((?![{{}}<>-]){literal}(?![{{}}<>-]))\b'
        marked_up = re.sub(pattern,
                           r'<b>\1</b>',
                           html.escape(element),
                           flags=re.I)
        element.replace_with(BeautifulSoup(marked_up, 'html.parser'))
Esempio n. 2
0
def replace_text_with_tag(sub_text: str, repl_tag: Tag,
                          text_tag: bs4.NavigableString,
                          article: Article) -> bs4.NavigableString:
    """Split ``text_tag`` around ``sub_text`` and insert ``repl_tag`` there.

    The text is split at the first occurrence of ``sub_text``. The part
    before it replaces ``text_tag``; ``repl_tag`` and then the part after it
    are inserted into the parent right behind. When ``sub_text`` does not
    occur, the whole text is kept and ``repl_tag`` plus an empty text node
    are still inserted after it.

    :param sub_text: text to cut out of ``text_tag``; an empty string makes
        ``str.partition`` raise ``ValueError``.
    :param repl_tag: tag inserted where ``sub_text`` was.
    :param text_tag: text node to split.
    :param article: its ``content`` is used as the parent when ``text_tag``
        has no parent or its parent is the ``[document]`` root.
    :return: the new text node holding whatever followed ``sub_text``.
    :raises ValueError: if ``text_tag`` cannot be found among the parent's
        children.
    """
    enclosing = text_tag.parent
    # If we can't find the parent, assume it's just the document.
    parent: Tag
    if enclosing is None or enclosing.name == '[document]':
        parent = article.content
    else:
        parent = enclosing

    # Look the node up by identity first: text nodes compare equal by their
    # text, so list.index() would return the first sibling with the same
    # string rather than this exact node.
    tag_idx = next(
        (i for i, child in enumerate(parent.contents) if child is text_tag),
        None,
    )
    if tag_idx is None:
        # Not found by identity (possible when the fallback parent is used):
        # keep the value-based lookup, which raises ValueError on a miss.
        tag_idx = parent.contents.index(text_tag)

    # Replace the matched text with a tag; partition() yields an empty tail
    # when sub_text is absent.
    begin, _, end = text_tag.partition(sub_text)

    # Convert these strings to text nodes.
    begin_node = bs4.NavigableString(begin)
    end_node = bs4.NavigableString(end)
    text_tag.replace_with(begin_node)
    parent.insert(tag_idx + 1, repl_tag)
    parent.insert(tag_idx + 2, end_node)
    return end_node
Esempio n. 3
0
def _byte_to_str_index(string, byte_index):
    """Convert an offset into the UTF-8 encoding of ``string`` to a str index.

    An offset that falls inside a multi-byte character is rounded down to the
    start of that character. Negative offsets give 0 and offsets past the end
    give ``len(string)``.
    """
    for idx, char in enumerate(string):
        char_size = len(char.encode('utf-8'))
        if char_size > byte_index:
            return idx
        byte_index -= char_size
    return len(string)


def highlight(html, highlights):
    """Highlight parts of an HTML document.

    The document is parsed with html5lib and walked depth-first in document
    order while counting the UTF-8 bytes of the text nodes met so far. Text
    inside a highlighted range is wrapped in ``<span class="highlight">``;
    a text node is split in two when a range starts or ends inside it.

    The walk is a single forward pass, so the ranges are presumably expected
    in ascending, non-overlapping order -- confirm against the callers.

    :param html: HTML source of the document.
    :param highlights: Iterable of (start, end) pairs, which are computed over
        UTF-8 bytes and don't count HTML tags
    :return: the contents of the parsed document's ``<body>``, serialized
        back to a string.
    """
    highlights = iter(highlights)
    soup = BeautifulSoup(html, 'html5lib')

    pos = 0  # UTF-8 byte offset of the current text node's first byte
    node = soup
    highlighting = False
    try:
        # An empty iterable raises StopIteration right here, which returns
        # the document unchanged through the handler below.
        start, end = next(highlights)
        while True:
            if getattr(node, 'contents', None):
                # Descend into the first child.
                node = node.contents[0]
            else:
                if isinstance(node, NavigableString):
                    nb = len(node.string.encode('utf-8'))
                    while True:
                        if not highlighting and start == pos:
                            # The range starts exactly at this node.
                            highlighting = True
                        elif not highlighting and pos + nb > start:
                            # The range starts inside this node: split it and
                            # continue with the right-hand part.
                            parent = node.parent
                            text = node.string
                            # Offsets are in bytes but str slicing is in
                            # characters, so convert before cutting.
                            cut = _byte_to_str_index(text, start - pos)
                            left = text[:cut]
                            right = text[cut:]
                            idx = parent.index(node)
                            node.replace_with(NavigableString(left))
                            node = NavigableString(right)
                            parent.insert(idx + 1, node)
                            # Advance by the bytes actually split off so the
                            # counters stay exact even when the offset was
                            # rounded down to a character boundary.
                            consumed = len(left.encode('utf-8'))
                            nb -= consumed
                            pos += consumed
                            # Code below will do the actual highlighting
                            highlighting = True
                        elif highlighting and pos + nb <= end:
                            # The whole node lies inside the range: wrap it.
                            newnode = soup.new_tag(
                                'span',
                                attrs={'class': 'highlight'},
                            )
                            node.replace_with(newnode)
                            newnode.append(node)
                            node = newnode
                            if pos + nb == end:
                                highlighting = False
                                start, end = next(highlights)
                            break
                        elif highlighting:
                            # The range ends inside this node: wrap the left
                            # part and continue with the remainder.
                            parent = node.parent
                            text = node.string
                            cut = _byte_to_str_index(text, end - pos)
                            left = text[:cut]
                            rest = text[cut:]
                            idx = parent.index(node)
                            newnode = NavigableString(left)
                            node.replace_with(newnode)
                            node = newnode
                            newnode = soup.new_tag(
                                'span',
                                attrs={'class': 'highlight'},
                            )
                            node.replace_with(newnode)
                            newnode.append(node)
                            node = NavigableString(rest)
                            parent.insert(idx + 1, node)
                            consumed = len(left.encode('utf-8'))
                            nb -= consumed
                            pos += consumed
                            highlighting = False
                            start, end = next(highlights)
                        else:
                            # The next range starts after this node.
                            break

                    pos += nb
                # Move to the next node in document order, climbing up while
                # the current node has no following sibling.
                while not node.next_sibling:
                    if not node.parent:
                        # Walked off the end of the document: leave through
                        # the same path as running out of highlights.
                        raise StopIteration
                    node = node.parent
                node = node.next_sibling
    except StopIteration:
        # Remove everything but body
        body = soup.body
        soup.clear()
        soup.append(body)

        # Remove the body tag itself to only have the contents
        soup.body.unwrap()

        # Back to text
        return str(soup)