예제 #1
0
def highlight(html, highlights, show_tags=False):
    """Highlight parts of an HTML document.

    The document is parsed with html5lib, every highlighted stretch of text
    is wrapped in ``<span class="highlight">``, and the contents of the
    resulting ``<body>`` are serialized back to a string.

    :param html: HTML source to highlight
    :param highlights: Iterable of (start, end, tags) triples, which are
        computed over UTF-8 bytes and don't count HTML tags. (start, end)
        pairs are accepted too and are treated as having no tags. ``tags``
        is joined with ``', '`` when ``show_tags`` is set.
    :param show_tags: Whether to show the tag names within brackets after each
        highlight (emitted as a ``<span class="taglist">`` element)
    :returns: The contents of the document's body, with highlights, as a
        string
    """
    # Turn every highlight into two events: a 'start' (which never carries
    # tags) and an 'end' (which carries the tags to display, if any)
    starts = []
    ends = []
    for hl in highlights:
        starts.append((hl[0], 'start', []))
        if len(hl) == 2:
            ends.append((hl[1], 'end', []))
        else:
            ends.append((hl[1], 'end', hl[2]))
    # Events are ordered by position; at equal positions this relies on the
    # fact that 'end' < 'start' (string comparison), so a highlight is closed
    # before the next one is opened
    events = sorted(ends + starts)

    events = iter(events)
    soup = BeautifulSoup(html, 'html5lib')

    # Byte offset (UTF-8, text only) of the beginning of the current node
    pos = 0
    node = soup
    # Number of highlights currently open (they may overlap)
    highlighting = 0
    # The "current event" is kept in event_pos/event_type/tags;
    # event_pos is None once there are no events left
    try:
        event_pos, event_type, tags = next(events)
    except StopIteration:
        event_pos = event_type = tags = None

    # Depth-first walk over the tree, in document order
    while node is not None:
        if getattr(node, 'contents', None):
            # Move down
            node = node.contents[0]
            continue

        if isinstance(node, NavigableString):
            # Move through text
            # nb is the number of bytes of this text node not yet consumed
            nb = len(node.string.encode('utf-8'))
            while event_pos is not None:
                if event_pos == pos and event_type == 'start':
                    # Start highlighting at beginning of text node
                    highlighting += 1
                    try:
                        event_pos, event_type, tags = next(events)
                    except StopIteration:
                        event_pos = None
                elif pos + nb > event_pos:
                    # Next event falls inside of this text node
                    if event_type == 'start' and highlighting:
                        # Keep highlighting (can't highlight *more*)
                        highlighting += 1
                    elif (
                        event_type == 'end'
                        and not show_tags
                        and highlighting > 1
                    ):
                        # Keep highlighting (no need to put labels)
                        highlighting -= 1
                    else:  # 'end' and (show_tags or highlighting becomes 0)
                        # (also reached for a 'start' while not highlighting)
                        # Split it
                        # byte_to_str_index is defined outside this block;
                        # presumably it converts an offset in UTF-8 bytes
                        # into an index into the str -- TODO confirm
                        char_idx = byte_to_str_index(
                            node.string,
                            event_pos - pos,
                        )
                        left = node.string[:char_idx]
                        right = node.string[char_idx:]

                        # Left part
                        newnode = NavigableString(left)
                        if highlighting:
                            # Optionally highlight left part
                            span = soup.new_tag(
                                'span',
                                attrs={'class': 'highlight'},
                            )
                            span.append(newnode)
                            newnode = span
                        node.replace_with(newnode)
                        node = newnode

                        if event_type == 'start':
                            highlighting += 1
                        else:
                            highlighting -= 1
                            if show_tags:
                                # Add tag labels
                                comment = soup.new_tag(
                                    'span',
                                    attrs={'class': 'taglist'},
                                )
                                comment.string = ' [%s]' % ', '.join(tags)
                                node.insert_after(comment)
                                node = comment

                        # Right part
                        newnode = NavigableString(right)
                        node.insert_after(newnode)
                        node = newnode
                        # The right part becomes the current text node:
                        # account for the bytes that went to the left part
                        nb -= event_pos - pos
                        pos = event_pos
                        # Next loop will highlight right part if needed

                    # All three cases above consumed the current event
                    try:
                        event_pos, event_type, tags = next(events)
                    except StopIteration:
                        event_pos = None
                elif highlighting:  # and pos + nb <= event_pos:
                    # Highlight whole text node
                    newnode = soup.new_tag(
                        'span',
                        attrs={'class': 'highlight'},
                    )
                    node.replace_with(newnode)
                    newnode.append(node)
                    node = newnode
                    if pos + nb == event_pos and event_type == 'end':
                        # Highlight ends exactly at the end of this text
                        # node: close it here rather than in the next node
                        if show_tags:
                            comment = soup.new_tag(
                                'span',
                                attrs={'class': 'taglist'},
                            )
                            comment.string = ' [%s]' % ', '.join(tags)
                            newnode.insert_after(comment)
                            node = comment
                        highlighting -= 1
                        try:
                            event_pos, event_type, tags = next(events)
                        except StopIteration:
                            event_pos = None
                        # NOTE(review): only one 'end' event is consumed
                        # here; further events at this same position are
                        # left for the next text node
                    break
                else:  # not highlighting and pos + nb <= event_pos
                    # Skip whole text node
                    break

            # Done with this text node, count the bytes that remained in it
            pos += nb

        # Move up until there's a sibling
        # NOTE(review): this is a truthiness test; an empty text sibling
        # would presumably be falsy and get skipped -- confirm against bs4
        while not node.next_sibling and node.parent:
            node = node.parent
        if not node.parent:
            # Back at the root: the whole tree has been visited
            break
        # Move to next node
        node = node.next_sibling

    # Remove everything but body
    body = soup.body
    soup.clear()
    soup.append(body)

    # Remove the body tag itself to only have the contents
    soup.body.unwrap()

    # Back to text
    return str(soup)
예제 #2
0
파일: extract.py 프로젝트: staeiou/taguette
def _byte_to_char_index(text, byte_offset):
    """Convert an offset in UTF-8 bytes into an index into ``text``.

    If the offset falls inside a multi-byte character, it is rounded down to
    the index of that character.
    """
    prefix = text.encode('utf-8')[:byte_offset]
    # 'ignore' drops a trailing partial character instead of raising
    return len(prefix.decode('utf-8', 'ignore'))


def _wrap_in_highlight(soup, node):
    """Wrap ``node`` in a ``<span class="highlight">`` and return the span."""
    span = soup.new_tag(
        'span',
        attrs={'class': 'highlight'},
    )
    node.replace_with(span)
    span.append(node)
    return span


def highlight(html, highlights):
    """Highlight parts of an HTML document.

    The document is parsed with html5lib, every highlighted stretch of text
    is wrapped in ``<span class="highlight">``, and the contents of the
    resulting ``<body>`` are serialized back to a string.

    :param html: HTML source to highlight
    :param highlights: Iterable of (start, end) pairs, which are computed over
        UTF-8 bytes and don't count HTML tags. They are consumed in order, so
        they are expected to be sorted and not to overlap.
    :returns: The contents of the document's body, with highlights, as a
        string
    """
    highlights = iter(highlights)
    soup = BeautifulSoup(html, 'html5lib')

    # Byte offset (UTF-8, text only) of the beginning of the current node
    pos = 0
    node = soup
    highlighting = False
    try:
        # next() raises StopIteration once every highlight has been applied,
        # which is what ends the walk early
        start, end = next(highlights)
        while True:
            if getattr(node, 'contents', None):
                # Move down
                node = node.contents[0]
                continue

            if isinstance(node, NavigableString):
                # Number of bytes of this text node not yet consumed
                nb = len(node.string.encode('utf-8'))
                while True:
                    if not highlighting and start == pos:
                        # Highlight starts at the beginning of this node
                        highlighting = True
                    elif not highlighting and pos + nb > start:
                        # Highlight starts inside this node: split it and
                        # carry on with the right part. Offsets are in
                        # bytes, so convert before slicing the str.
                        idx = _byte_to_char_index(node.string, start - pos)
                        left = NavigableString(node.string[:idx])
                        right = NavigableString(node.string[idx:])
                        node.replace_with(left)
                        left.insert_after(right)
                        node = right
                        nb -= start - pos
                        pos = start
                        # Code below will do the actual highlighting
                        highlighting = True
                    elif highlighting and pos + nb <= end:
                        # Highlight covers the whole (rest of the) node
                        node = _wrap_in_highlight(soup, node)
                        if pos + nb == end:
                            highlighting = False
                            start, end = next(highlights)
                        break
                    elif highlighting:
                        # Highlight ends inside this node: highlight the
                        # left part and carry on with the rest
                        idx = _byte_to_char_index(node.string, end - pos)
                        left = NavigableString(node.string[:idx])
                        rest = NavigableString(node.string[idx:])
                        node.replace_with(left)
                        span = _wrap_in_highlight(soup, left)
                        span.insert_after(rest)
                        node = rest
                        nb -= end - pos
                        pos = end
                        highlighting = False
                        start, end = next(highlights)
                    else:
                        # Next highlight is past this node, skip it
                        break

                pos += nb

            # Move up until there's a sibling. Compare against None: an
            # empty text node is falsy but is still a sibling to visit.
            while node.next_sibling is None and node.parent is not None:
                node = node.parent
            if node.parent is None:
                # Back at the root: the whole tree has been visited
                break
            # Move to next node
            node = node.next_sibling
    except StopIteration:
        # No highlights left, nothing more to change in the tree
        pass

    # Remove everything but body
    body = soup.body
    soup.clear()
    soup.append(body)

    # Remove the body tag itself to only have the contents
    soup.body.unwrap()

    # Back to text
    return str(soup)