Esempio n. 1
0
async def taskTx(sock, message, mtype):  # a poor implementation of an output coroutine.
    """Sanitise *message* as an HTML fragment and transmit it over *sock*.

    Control codes are handled first: ``b"200"`` says goodbye and closes the
    socket, ``b"202"`` acknowledges admin authentication.  Any other message
    is parsed with html5lib, run through the sanitizer filter, re-serialised
    and sent either raw (when ``revertProtocol`` is set) or wrapped in a JSON
    envelope carrying its message type.

    :param sock: websocket-like object with async ``send`` / ``close``
    :param message: raw payload (bytes control code or HTML text)
    :param mtype: message-type tag placed in the JSON envelope
    """
    global revertProtocol
    # Short-circuit control messages before paying for HTML sanitisation;
    # the original sanitised unconditionally, which was wasted work here.
    if message == b"200":
        await sock.send("Goodbye.")
        await sock.close()
        return
    if message == b"202":
        await sock.send("Authentication Successful, you are now the admin terminal.")
        return
    builder = html5lib.getTreeBuilder("dom")
    parser = html5lib.HTMLParser(tree=builder)
    walker = html5lib.getTreeWalker("dom")
    fragment = parser.parseFragment(message)
    clean = sanitizer.Filter(walker(fragment))
    # ''.join over the serializer generator instead of the quadratic += loop.
    tx = ''.join(html5lib.serializer.HTMLSerializer().serialize(clean))
    if revertProtocol:
        await sock.send(tx)
    else:
        await sock.send(json.dumps({"MSG_TYPE": mtype, "MSG": tx}))
Esempio n. 2
0
def sanitize_html(html):
    """
    Make the given HTML string safe to display in a Yarrharr page.
    """
    tree = html5lib.parseFragment(html)
    serializer = html5lib.serializer.HTMLSerializer()
    stream = html5lib.getTreeWalker("etree")(tree)
    # Each transformation consumes and re-emits the token stream.
    for transform in (
        _strip_attrs,
        _drop_empty_tags,
        _ReplaceObjectFilter,
        _ElideFilter,
        _ReplaceYoutubeEmbedFilter,
        _ExtractTitleTextFilter,
        _adjust_links,
        _video_attrs,
        _wp_smileys,
    ):
        stream = transform(stream)
    extra_elements = frozenset([
        # https://github.com/html5lib/html5lib-python/pull/423
        (namespaces["html"], "summary"),
        # https://github.com/html5lib/html5lib-python/pull/395
        (namespaces["html"], "wbr"),
    ])
    stream = sanitizer.Filter(
        stream,
        allowed_elements=sanitizer.allowed_elements | extra_elements,
    )
    return serializer.render(stream)
Esempio n. 3
0
def strip_style_and_script(input):
    """Serialise *input* HTML with <script>/<style> subtrees dropped."""
    tree = html5lib.parseFragment(input, treebuilder="dom")
    tokens = html5lib.getTreeWalker("dom")(tree)
    filtered = NoChildTagFilter(tokens, ("script", "style"))
    return html5lib.serializer.HTMLSerializer().render(filtered)
Esempio n. 4
0
 def run(self, text):
     """Parse *text* with the Forge-sanitising tokenizer and re-serialise it."""
     sanitizing_parser = html5lib.HTMLParser(tokenizer=ForgeHTMLSanitizer)
     tree = sanitizing_parser.parse(text)
     tokens = html5lib.getTreeWalker("etree")(tree)
     # Normalise attribute order before rendering.
     ordered = html5lib.filters.alphabeticalattributes.Filter(tokens)
     return ''.join(html5lib.serializer.HTMLSerializer().serialize(ordered))
Esempio n. 5
0
 def run(self, text):
     """Sanitise *text* via ForgeHTMLSanitizer and return the cleaned HTML."""
     tree = html5lib.HTMLParser(tokenizer=ForgeHTMLSanitizer).parse(text)
     walk = html5lib.getTreeWalker("etree")
     # Alphabetise attributes so the output is deterministic.
     stream = html5lib.filters.alphabeticalattributes.Filter(walk(tree))
     serializer = html5lib.serializer.HTMLSerializer()
     return ''.join(serializer.serialize(stream))
Esempio n. 6
0
def test_htmlserialized ():
    """Entities and comments are resolved/dropped during serialisation."""
    markup = """<html><body>
<p>Hello &amp; <!-- comment -->W&#xf6;rld! &clubs; &Backslash;</p>
</body></html>"""
    document = html5lib.parse (StringIO (markup))
    tokens = html5lib.getTreeWalker("etree") (document)
    rendered = ''.join (HTMLSerializer().serialize(tokens))
    assert rendered == ' Hello & Wörld! ♣ \u2216\n\n '
Esempio n. 7
0
def html_tree_to_text(html_tree):
    """Serialise *html_tree*, always quoting attributes and using XHTML-style
    void tags (e.g. ``<br />``)."""
    serializer = html5lib.serializer.HTMLSerializer(
        quote_attr_values='always',
        use_trailing_solidus=True,
        space_before_trailing_solidus=True,
    )
    tokens = html5lib.getTreeWalker('etree')(html_tree)
    return u''.join(serializer.serialize(tokens))
Esempio n. 8
0
    def __init__(self,
                 tags=ALLOWED_TAGS,
                 attributes=ALLOWED_ATTRIBUTES,
                 styles=ALLOWED_STYLES,
                 protocols=ALLOWED_PROTOCOLS,
                 strip=False,
                 strip_comments=True,
                 filters=None):
        """Create a Cleaner.

        :arg list tags: tags to allow; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: attributes to allow; a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: css styles to allow; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: link protocols to allow; defaults to
            ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not disallowed elements are stripped

        :arg bool strip_comments: whether or not HTML comments are stripped

        :arg list filters: html5lib Filter classes the token stream is passed
            through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Filters change the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output are secure.

        """
        self.tags = tags
        self.attributes = attributes
        self.styles = styles
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        self.filters = filters or []

        # Parse without HTML namespacing; walk the resulting etree tree.
        self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
        self.walker = html5lib.getTreeWalker('etree')
        # Bleach does its own sanitising and already alphabetises attributes,
        # so both html5lib serializer features stay disabled here.
        self.serializer = HTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,
            sanitize=False,
            alphabetical_attributes=False,
        )
Esempio n. 9
0
    def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
                 styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
                 strip_comments=True, filters=None):
        """Create a Cleaner.

        :arg list tags: tags to allow; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: attributes to allow; a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: css styles to allow; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: link protocols to allow; defaults to
            ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not disallowed elements are stripped

        :arg bool strip_comments: whether or not HTML comments are stripped

        :arg list filters: html5lib Filter classes the token stream is passed
            through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Filters change the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output are secure.

        """
        self.tags = tags
        self.attributes = attributes
        self.styles = styles
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        self.filters = filters or []

        # Bleach's own parser, un-namespaced, feeding the etree walker.
        self.parser = BleachHTMLParser(namespaceHTMLElements=False)
        self.walker = html5lib.getTreeWalker('etree')
        self.serializer = BleachHTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,
            escape_lt_in_attrs=True,

            # We want to leave entities as they are without escaping or
            # resolving or expanding
            resolve_entities=False,

            # Bleach has its own sanitizer, so don't use the html5lib one
            sanitize=False,

            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
            alphabetical_attributes=False,
        )
Esempio n. 10
0
def test_iterable(etree):
    """The filter exposes the wrapped tree and iterates like a generator."""
    tokens = getTreeWalker('etree')(etree)
    stream = TruncationFilter(tokens, 98, end='...')

    assert stream.tree is etree

    it = iter(stream)
    assert it is not stream
    assert iter(it) is it
Esempio n. 11
0
def html_tree_to_text(html_tree):
    """Render *html_tree* with quoted attributes and ``<br />``-style voids."""
    serializer = html5lib.serializer.HTMLSerializer(
        quote_attr_values='always',
        use_trailing_solidus=True,
        space_before_trailing_solidus=True,
    )
    walk = html5lib.getTreeWalker('etree')
    return u''.join(serializer.serialize(walk(html_tree)))
Esempio n. 12
0
def test_iterable(etree):
    """TruncationFilter keeps the source tree and behaves as its own iterator."""
    stream = TruncationFilter(getTreeWalker('etree')(etree), 98, end='...')

    assert stream.tree is etree

    iterator = iter(stream)
    # iter() on the filter yields a distinct generator, which is self-iterable.
    assert iterator is not stream
    assert iter(iterator) is iterator
Esempio n. 13
0
def obfuscate_emails(content):
    """Rewrite e-mail addresses inside *content* in place (static files skipped)."""
    if isinstance(content, contents.Static):
        return

    dom = html5lib.parseFragment(content._content, treebuilder="etree")
    tokens = html5lib.getTreeWalker("etree")(dom)
    obfuscated = ObfuscateEmailsFilter(tokens)
    serializer = html5lib.serializer.HTMLSerializer(
        quote_attr_values="always", omit_optional_tags=False)
    content._content = serializer.render(obfuscated)
Esempio n. 14
0
def write_node(node, out):
    """Serialise DOM *node* to the writable *out*, keeping markup explicit
    (quoted attributes, full boolean attributes, no omitted tags)."""
    tokens = html5lib.getTreeWalker("dom")(node)
    serializer = html5lib.serializer.HTMLSerializer(
        quote_attr_values='always',
        minimize_boolean_attributes=False,
        use_best_quote_char=True,
        omit_optional_tags=False
    )
    # writelines consumes the serializer's chunk generator lazily.
    out.writelines(serializer.serialize(tokens))
def test_with_serializer():
    """Verify filter works in the context of everything else"""
    dom = html5lib.HTMLParser().parseFragment(
        '<svg><pattern xlink:href="#patt2" id="patt1"></svg>')
    ser = HTMLSerializer(alphabetical_attributes=True,
                         quote_attr_values='always')
    rendered = ser.render(html5lib.getTreeWalker('etree')(dom))

    # FIXME(willkg): The "xlink" namespace gets dropped by the serializer. When
    # that gets fixed, we can fix this expected result.
    assert rendered == '<svg><pattern id="patt1" href="#patt2"></pattern></svg>'
def test_with_serializer():
    """Verify filter works in the context of everything else"""
    parser = html5lib.HTMLParser()
    fragment = parser.parseFragment('<svg><pattern xlink:href="#patt2" id="patt1"></svg>')
    tokens = html5lib.getTreeWalker('etree')(fragment)
    serializer = HTMLSerializer(
        alphabetical_attributes=True,
        quote_attr_values='always'
    )

    # FIXME(willkg): The "xlink" namespace gets dropped by the serializer. When
    # that gets fixed, we can fix this expected result.
    expected = '<svg><pattern id="patt1" href="#patt2"></pattern></svg>'
    assert serializer.render(tokens) == expected
Esempio n. 17
0
def trim_html(html):
    """Strip surrounding whitespace content from a Markup HTML value."""
    if not isinstance(html, Markup):
        raise TypeError("trim_html: expected Markup, got {!r}".format(type(html)))

    # TODO i think this could be combined with the bleach.clean call to avoid a
    # double parse?  filters apply during serialization, bleach applies during
    # tokenization
    # TODO alternatively, could this apply during tokenization to avoid
    # bothering with any markup we're not even going to show?
    tokens = html5lib.getTreeWalker("etree")(html5lib.parse(html))
    trimmed = TrimFilter(tokens)
    rendered = u"".join(html5lib.serializer.HTMLSerializer().serialize(trimmed))
    return Markup(rendered.strip())
Esempio n. 18
0
def trim_html(html):
    """Trim leading/trailing whitespace from the rendered form of *html*."""
    if not isinstance(html, Markup):
        raise TypeError("trim_html: expected Markup, got {!r}".format(type(html)))

    # TODO i think this could be combined with the bleach.clean call to avoid a
    # double parse?  filters apply during serialization, bleach applies during
    # tokenization
    # TODO alternatively, could this apply during tokenization to avoid
    # bothering with any markup we're not even going to show?
    tree = html5lib.parse(html)
    stream = TrimFilter(html5lib.getTreeWalker('etree')(tree))
    serializer = html5lib.serializer.HTMLSerializer()

    return Markup(u''.join(serializer.serialize(stream)).strip())
Esempio n. 19
0
 def __init__(self, ignore_headers=True, raise_invalid_tags=False):
     """
     :param ignore_headers: If true, text inside the tags listed in HEADER_ELEMENTS is ignored; such
     "header element" text is typically not a sentence.
     :param raise_invalid_tags: If true, an InvalidTagError is raised for any tag outside INLINE_ELEMENTS,
     BLOCK_LEVEL_ELEMENTS (which includes the elements of HEADER_ELEMENTS), SKIPPED_ELEMENTS,
     EMPTY_ELEMENTS, or SENTENCE_VOID_ELEMENTS. If false, the tag and all of its children are ignored
     (sentences descending from it will not be included in the value returned from feed).
     """
     self.ignore_header_text = ignore_headers
     self.raise_invalid_tags = raise_invalid_tags
     self.sentences = []
     # html5lib builds etree trees by default.
     self.parser = html5lib.HTMLParser()
     self.walker = html5lib.getTreeWalker("etree")
     self.reset()
Esempio n. 20
0
def truncate(html,
             truncated_message,
             suffix,
             max_entities=None,
             max_length=None):
    """Truncate *html* for Telegram, appending *truncated_message*/*suffix*
    streams when the entity or length limits are hit."""
    walker = html5lib.getTreeWalker('etree')

    def stream_of(markup):
        # Parse a fragment and hand back its token stream.
        return walker(html5lib.parseFragment(markup, treebuilder='etree'))

    truncated = TelegramTruncator(stream_of(html),
                                  truncated_message=stream_of(truncated_message),
                                  suffix=stream_of(suffix),
                                  max_entities=max_entities,
                                  max_length=max_length)
    return HTMLSerializer().render(truncated).strip('\n')
Esempio n. 21
0
def sanitize_html(html):
    """
    Make the given HTML string safe to display in a Yarrharr page.
    """
    tree = html5lib.parseFragment(html)
    serializer = html5lib.serializer.HTMLSerializer(sanitize=True)
    stream = html5lib.getTreeWalker('etree')(tree)
    # Each transformation wraps the token stream of the previous one.
    for transform in (_strip_attrs, _drop_empty_tags, _ReplaceObjectFilter,
                      _ElideFilter, _ReplaceYoutubeEmbedFilter,
                      _ExtractTitleTextFilter, _adjust_links, _video_attrs,
                      _wp_smileys):
        stream = transform(stream)
    return serializer.render(stream)
Esempio n. 22
0
def filterEpub (item):
    """ epub reader """
    book = epub.read_epub (item.rstrip ())
    logging.debug (f'reading ebook {item}')
    walker = html5lib.getTreeWalker("etree")
    for item in book.get_items_of_type (ebooklib.ITEM_DOCUMENT):
        logging.debug (f'got item {item.get_name ()}')
        # XXX: in theory html5lib should be able to detect the encoding of
        # bytes(), but it does not.
        document = html5lib.parse (item.get_content ().decode ('utf-8'))
        serializer = HTMLSerializer()
        yield ''.join (serializer.serialize (walker (document)))
    # It looks like ebooklib is leaking ZipFile instances somewhere, which
    # can be prevented by resetting the book before the GC grabs it.
    book.reset ()
    del book
Esempio n. 23
0
def typogrify(html):
    """Run *html* through the typographic token filters and re-serialise it."""
    # Using etree is important here because it does not suffer from a bug
    # where a text featuring entitities is split into various
    # adjacent text nodes.
    # (thanks html5lib folks for the tip).
    # See <https://github.com/html5lib/html5lib-python/issues/208>
    dom = html5lib.parseFragment(html, treebuilder="etree")
    walker = html5lib.getTreeWalker("etree")

    stream = walker(dom)
    stream = whitespace.Filter(stream)
    stream = medor.Filter(stream)
    stream = figures.Filter(stream)

    # html5lib >= 0.99999999 expects a string here; the old boolean form
    # (quote_attr_values=True) now raises ValueError during serialisation.
    s = html5lib.serializer.HTMLSerializer(quote_attr_values='always',
                                           omit_optional_tags=False)

    return s.render(stream)
Esempio n. 24
0
def _hyphenate_html(html):
    """Return *html* with ``_hyphenate`` applied to every text node."""
    def hyphen_gen(stream):
        # Pass all tokens through unchanged except character data, whose
        # text is replaced with its hyphenated form.  (The original had a
        # redundant `text = el["data"]` immediately overwritten.)
        for el in stream:
            if el["type"] == "Characters":
                el['data'] = _hyphenate(el["data"])
            yield el

    doc = html5lib.parseFragment(html, namespaceHTMLElements=False)
    walker = html5lib.getTreeWalker('etree')
    stream = hyphen_gen(walker(doc))

    return html5lib.serializer.HTMLSerializer().render(stream)
Esempio n. 25
0
    def wbr_serialize(self):
        """Returns concatenated HTML code with WBR tag. This is still experimental.

    Returns:
      The organized HTML code. (str)
    """
        doc = ET.Element('span')
        doc.attrib['style'] = 'word-break: keep-all'
        for chunk in self:
            if chunk.has_cjk() and doc.text:
                # Insert a break opportunity before a CJK chunk once the
                # container already holds some text.
                ele = ET.Element('wbr')
                doc.append(ele)
                ele.tail = chunk.word
            else:
                # add word without span tag for non-CJK text (e.g. English)
                # by appending it after the last element
                children = list(doc)  # Element.getchildren() was removed in Python 3.9
                if children:
                    if children[-1].tail is None:
                        children[-1].tail = chunk.word
                    else:
                        children[-1].tail += chunk.word
                else:
                    if doc.text is None:
                        doc.text = chunk.word
                    else:
                        doc.text += chunk.word
        content = ET.tostring(doc, encoding='utf-8').decode('utf-8')
        dom = html5lib.parseFragment(content)
        stream = getTreeWalker('etree')(dom)
        serializer = html5lib.serializer.HTMLSerializer(
            quote_attr_values='always')
        # Allow <wbr> and the word-break CSS property on top of the
        # sanitizer defaults.
        allowed_elements = set(sanitizer.allowed_elements)
        allowed_elements.add((namespaces['html'], 'wbr'))
        allowed_css_properties = set(sanitizer.allowed_css_properties)
        allowed_css_properties.add('word-break')
        return serializer.render(
            sanitizer.Filter(
                stream,
                allowed_elements=allowed_elements,
                allowed_css_properties=allowed_css_properties,
            ))
Esempio n. 26
0
def apply_linkification(
    html: str,
    skip_tags: Optional[List[str]] = None,
) -> str:
    """Apply custom linkification filter to convert text patterns to links."""
    tree = HTMLParser(namespaceHTMLElements=False).parseFragment(html)
    tokens = html5lib.getTreeWalker('etree')(tree)
    linkified = LinkifyFilter(tokens, skip_tags)

    # Explicit settings: quote everything, keep optional tags, and leave
    # sanitising/attribute ordering to other stages.
    serializer = HTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,
        sanitize=False,
        alphabetical_attributes=False,
    )
    return serializer.render(linkified)
Esempio n. 27
0
    def __init__(self,
                 callbacks=DEFAULT_CALLBACKS,
                 skip_tags=None,
                 parse_email=False,
                 url_re=URL_RE,
                 email_re=EMAIL_RE):
        """Create a Linker instance.

        :arg list callbacks: callbacks run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: tags whose contents should not be linkified;
            for example ``['pre']`` skips linkifying inside ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        :returns: linkified text as unicode

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # Parse without HTML namespacing and walk the etree result.
        self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
        self.walker = html5lib.getTreeWalker('etree')
        # Linkify neither sanitises nor alphabetises attributes.
        self.serializer = HTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,
            sanitize=False,
            alphabetical_attributes=False,
        )
Esempio n. 28
0
def fix_french(html):
    """Apply French typography filters (whitespace, medor, hyphenation) to *html*."""
    # Using etree is important here because it does not suffer from a bug
    # where a text featuring entities is split into various
    # adjacent text nodes.
    # (thanks html5lib folks for the tip).
    # See <https://github.com/html5lib/html5lib-python/issues/208>
    dom = html5lib.parseFragment(html, treebuilder="etree")
    walker = html5lib.getTreeWalker("etree")

    stream = walker(dom)
    stream = whitespace.Filter(stream)
    stream = medor.Filter(stream)
    #stream = figures.Filter(stream)
    stream = hyphenate.Filter(stream, min_len=9, left=4, right=5)

    # html5lib >= 0.99999999 expects a string for quote_attr_values; the old
    # boolean form (True) now raises ValueError during serialisation.
    serializer = html5lib.serializer.HTMLSerializer(quote_attr_values='always',
            alphabetical_attributes=True,
            omit_optional_tags=False)

    return serializer.render(stream)
Esempio n. 29
0
def html_sanitize(text):
	"""Sanitise *text* twice: first through html5lib's (legacy, pre-1.0 API)
	HTMLSanitizer tokenizer, then through BeautifulSoup to drop unwanted
	attributes, keeping only a whitelisted subset of inline styles.
	Returns the <body> contents of the prettified result.
	"""
	if not text:
		return ''
	# NOTE(review): tokenizer=sanitizer.HTMLSanitizer is the pre-1.0
	# html5lib API; modern releases moved sanitising to a serializer filter.
	p = HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
	element = p.parseFragment(text)
	walker = getTreeWalker("etree")
	stream = walker(element)
	s = serializer.HTMLSerializer()
	text = s.render(stream)
	# Re-decode the serialised markup defensively before the soup pass.
	text = UnicodeDammit(text, ["utf-8"])
	REMOVE_ATTRIBUTES = [
		'lang','language','onmouseover','onmouseout','script','font','style',
		'dir','face','size','color','style','class','width','height','hspace',
		'border','valign','align','background','bgcolor','text','link','vlink',
		'alink','cellpadding','cellspacing', 'id']

	soup = BeautifulSoup(text.unicode_markup)
	for attribute in REMOVE_ATTRIBUTES:
		for tag in soup.findAll():

			if(attribute == 'style'):
				# Rebuild 'style' from scratch, keeping only the
				# bold/italic/underline-ish declarations found in the original.
				new_style = ''
				style = tag.attrs.get('style', None)
				if style:
					if style.find('normal') != -1: new_style += " font-weight:normal; "
					elif style.find('bold') != -1: new_style += " font-weight:bold; "
					if style.find('italic') != -1: new_style += " font-style: italic; "
					if style.find('underline') != -1: new_style += " text-decoration: underline; "
					tag.attrs['style'] = new_style

			else:
				del(tag[attribute])

	html = soup.prettify('utf-8')
	# Extract just the body; fall back to the whole document when the
	# regex finds no <body> wrapper.
	try:
		body = re.findall(r'<body>(.*)</body>', html, re.S)[0].strip()
	except IndexError:
		body = html
	return body
Esempio n. 30
0
    def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False,
                 url_re=URL_RE, email_re=EMAIL_RE):
        """Create a Linker.

        :arg list callbacks: callbacks run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: tags whose contents should not be linkified;
            for example ``['pre']`` skips linkifying inside ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        :returns: linkified text as unicode

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # Un-namespaced parse, walked as etree tokens.
        self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
        self.walker = html5lib.getTreeWalker('etree')
        # linkify neither sanitises nor alphabetises, so both html5lib
        # serializer features are switched off.
        self.serializer = HTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,
            sanitize=False,
            alphabetical_attributes=False,
        )
Esempio n. 31
0
def truncate_html(html, *args, **kwargs):
    """Truncates HTML string.

    :param html: The HTML string or parsed element tree (with
                 :func:`html5lib.parse`).
    :param kwargs: Similar with :class:`.filters.TruncationFilter`.

    :return: The truncated HTML string.
    """
    if hasattr(html, 'getchildren'):
        etree = html
    else:
        etree = html5lib.parse(html)

    # html5lib.parse() defaults to the "etree" treebuilder, so the walker
    # must match it; the "lxml" walker cannot traverse an etree document.
    walker = html5lib.getTreeWalker('etree')

    stream = walker(etree)
    stream = TruncationFilter(stream, *args, **kwargs)

    serializer = html5lib.serializer.HTMLSerializer()
    serialized = serializer.serialize(stream)

    return u''.join(serialized).strip()
Esempio n. 32
0
def truncate_html(html, *args, **kwargs):
    """Truncates HTML string.

    :param html: The HTML string or parsed element tree (with
                 :func:`html5lib.parse`).
    :param kwargs: Similar with :class:`.filters.TruncationFilter`.

    :return: The truncated HTML string.
    """
    # Accept either markup or an already-parsed tree.
    tree = html if hasattr(html, 'getchildren') else html5lib.parse(html)

    tokens = html5lib.getTreeWalker('etree')(tree)
    truncated = TruncationFilter(tokens, *args, **kwargs)

    serializer = html5lib.serializer.HTMLSerializer()
    return u''.join(serializer.serialize(truncated)).strip()
Esempio n. 33
0
 def __init__(self, ignore_headers=True, raise_invalid_tags=False):
     """
     :param ignore_headers: If true, ignores text inside of the tags included in HEADER_ELEMENTS. This defaults to
     true because the text inside of these "header elements" is typically not a sentence.
     :param raise_invalid_tags: If true, raises an InvalidTagError when parsing a tag not in INLINE_ELEMENTS,
     BLOCK_LEVEL_ELEMENTS (which includes the elements of HEADER_ELEMENTS), SKIPPED_ELEMENTS, EMPTY_ELEMENTS, or
     SENTENCE_VOID_ELEMENTS. If false, ignores this tag and all of its children. (Sentences descending from it will
     not be included in the value returned from feed)
     """
     # self.parser is an etree parser by default.
     self.parser = html5lib.HTMLParser()
     self.walker = html5lib.getTreeWalker("etree")
     self.sentences = []
     # NOTE(review): presumably a nesting counter for ignored subtrees and
     # the text accumulated for the current sentence — confirm in feed().
     self.ignored_parent_count = 0
     self.current_string = ''
     self.ignore_header_text = ignore_headers
     self.raise_invalid_tags = raise_invalid_tags
     # Seed Punkt with common abbreviations so e.g. "Dr." does not end a sentence.
     punkt_param = PunktParameters()
     abbreviations = [
         'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
         'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
         'Adj', 'Adm', 'Adv', 'Asst', 'Bart', 'Bldg', 'Brig', 'Bros',
         'Capt', 'Cmdr', 'Col', 'Comdr', 'Con', 'Corp', 'Cpl', 'DR', 'Dr',
         'Drs', 'Ens', 'Gen', 'Gov', 'Hon', 'Hr', 'Hosp', 'Insp', 'Lt',
         'MM', 'MR', 'MRS', 'MS', 'Maj', 'Messrs', 'Mlle', 'Mme', 'Mr',
         'Mrs', 'Ms', 'Msgr', 'Op', 'Ord', 'Pfc', 'Ph', 'Prof', 'Pvt',
         'Rep', 'Reps', 'Res', 'Rev', 'Rt', 'Sen', 'Sens', 'Sfc', 'Sgt',
         'Sr', 'St', 'Supt', 'Surg', 'v', 'vs', 'i.e', 'inc', 'rev', 'e.g',
         'etc', 'Nos', 'Nr', 'pp', 'Jan', 'Feb', 'Mar', 'Apr', 'Jun', 'Jul',
         'Aug', 'Sep', 'Oct', 'Nov', 'Dec'
     ]
     punkt_param.abbrev_types = set(abbreviations)
     self.tokenizer = PunktSentenceTokenizer(punkt_param)
     # Warnings and above go to a local log file.
     logging.basicConfig(filename='html-tokenizer.log',
                         level=logging.WARNING,
                         format='[%(asctime)s] [%(levelname)s] %(message)s',
                         datefmt='%Y-%m-%d %H:%M:%S')
Esempio n. 34
0
# the ihatexml module emits data loss warnings.  This in our case is okay
# because we are willing to accept the data loss that happens on the way
# from HTML to XML as we never go in reverse direction.  In particular the
# problem is XML namespaces which are not supported in HTML.
warnings.filterwarnings('ignore', category=DataLossWarning)


class ProcessingError(Exception):
    """Exception type for errors raised by this module's processing pipeline."""
    pass


def compile_selector(sel):
    # Use lxml's HTML translator (rather than the XML default) when
    # compiling the CSS selector.
    return CSSSelector(sel, translator='html')


# Shared walker for serialising lxml-built trees.
tree_walker = html5lib.getTreeWalker('lxml')


class Processor(object):

    def __init__(self, title_cleanup_regex=None,
                 content_selectors=None,
                 ignore=None,
                 no_default_ignores=False):
        self.content_selectors = [compile_selector(sel) for sel in
                                  content_selectors or ('body',)]
        if title_cleanup_regex is not None:
            title_cleanup_regex = re.compile(title_cleanup_regex, re.UNICODE)
        self.title_cleanup_regex = title_cleanup_regex
        self.ignore = [compile_selector(sel) for sel in ignore or ()]
        if not self.ignore and not no_default_ignores:
Esempio n. 35
0
File: util.py Progetto: dpk/ikwi
def serialize_fragment(h):
    """Serialise tree *h* to HTML, slicing off a fixed-size wrapper."""
    # urgh
    walker = html5lib.getTreeWalker("etree")
    stream = walker(h)
    s = html5lib.serializer.HTMLSerializer()
    # NOTE(review): the [5:-6] slice drops 5 leading and 6 trailing chars of
    # the rendered output — presumably a fixed wrapper tag such as
    # "<div>…</div>"; confirm against the callers that build `h`.
    return ''.join(s.serialize(stream))[5:-6]
Esempio n. 36
0
File: corpus.py Progetto: fnl/libfnl
    'wbr',
    'xmp',
})

# Tags whose whitespace must not be collapsed.
SPACE_PRESERVING_TAGS = frozenset({
    'pre',
    'style',
    'script',
    'textarea',
})


# Return e itself when it already is a <tag> element, otherwise its first
# <tag> descendant (None if absent).
_ensure = lambda e, tag: e.find(tag) if e.tag != tag else e

# HTML5 serialization setup
_tree_walker = html5lib.getTreeWalker("etree", implementation=etree)
_serializer = html5lib.serializer.HTMLSerializer(omit_optional_tags=False,
                                                 resolve_entities=False)

# HTML5 parsing setup
_tree_builder = html5lib.getTreeBuilder("etree", implementation=etree)
_parser = html5lib.HTMLParser(_tree_builder, namespaceHTMLElements=False)
# FIX for HTMLParser.reset():
if not hasattr(_parser, "innerHTMLMode"):
    # add the missing attribute, as otherwise calling .reset() would raise an AttributeError
    _parser.innerHTMLMode = None


def Root(title=None, encoding=None) -> Element:
    root = Element(ROOT_TAG)
    head = SubElement(root, HEAD_TAG)
Esempio n. 37
0
    'video',
    'wbr',
    'xmp',
})

# Elements in which whitespace is significant and must be preserved.
SPACE_PRESERVING_TAGS = frozenset({
    'pre',
    'style',
    'script',
    'textarea',
})

# Yield e when its tag already matches, else look up the first matching child.
_ensure = lambda e, tag: e.find(tag) if e.tag != tag else e

# HTML5 serialization setup
_tree_walker = html5lib.getTreeWalker("etree", implementation=etree)
_serializer = html5lib.serializer.HTMLSerializer(omit_optional_tags=False,
                                                 resolve_entities=False)

# HTML5 parsing setup
_tree_builder = html5lib.getTreeBuilder("etree", implementation=etree)
_parser = html5lib.HTMLParser(_tree_builder, namespaceHTMLElements=False)
# FIX for HTMLParser.reset():
if not hasattr(_parser, "innerHTMLMode"):
    # add the missing attribute, as otherwise calling .reset() would raise an AttributeError
    _parser.innerHTMLMode = None


def Root(title=None, encoding=None) -> Element:
    root = Element(ROOT_TAG)
    head = SubElement(root, HEAD_TAG)
Esempio n. 38
0
import html5lib

# Parse a local file and dump the serializer's output chunk by chunk.
with open("test_site/test.html", "rb") as f:
    element = html5lib.parse(f)
    tokens = html5lib.getTreeWalker("etree")(element)
    for chunk in html5lib.serializer.HTMLSerializer().serialize(tokens):
        print("%r" % chunk)
Esempio n. 39
0
def filterHtml (selectFunc, fd):
    """Parse *fd*, keep only tokens accepted by *selectFunc*, re-serialise."""
    document = html5lib.parse (fd)
    tokens = html5lib.getTreeWalker("etree") (document)
    serializer = HTMLSerializer()
    yield ''.join (serializer.serialize(Select (tokens, selectFunc)))
Esempio n. 40
0
def sanitize_html(text):
    """Parse *text* into an lxml tree, sanitise its stream and re-render it."""
    tree = html5lib.parse(text, treebuilder='lxml')
    tokens = html5lib.getTreeWalker('lxml')(tree)
    return _html_serializer.render(_html_sanitizer_stream(tokens))
Esempio n. 41
0
import html5lib

# Parse a literal string; html5lib.parse() returns an etree element by default.
document1 = html5lib.parse("<p>Hello World!</p>")
print(document1)

from urllib.request import urlopen

# Parse a live page, forwarding the declared charset so the bytes are
# decoded correctly.
with urlopen("http://www.google.com/") as f:
    document2 = html5lib.parse(
        f, transport_encoding=f.info().get_content_charset())
    print(document2)

# Same parse, but building a DOM tree instead of etree.
document3 = html5lib.HTMLParser(
    tree=html5lib.getTreeBuilder("dom")).parse("<p>Hello World!</p>")
print(document3)

# Walk the parsed tree and stream the serialised chunks one at a time.
element = html5lib.parse('<p>Hello World!</p>')
walker = html5lib.getTreeWalker("etree")
stream = walker(element)
s = html5lib.serializer.HTMLSerializer().serialize(stream)
for i in s:
    print(i)

from html5lib.filters import sanitizer

# Wrap the token stream in the sanitizer filter to neutralise unsafe markup.
dom = html5lib.parse("<script>alert('warning!')</script>", treebuilder="dom")
walker = html5lib.getTreeWalker("dom")
clean_stream = sanitizer.Filter(walker(dom))
print(clean_stream)
Esempio n. 42
0
def print_tokens(html):
    """Debug helper: pretty-print the html5lib token stream for *html*."""
    tree = html5lib.parseFragment(html)
    walker = html5lib.getTreeWalker("etree")
    print("Tokens for", html)
    for token in walker(tree):
        pprint(token)
Esempio n. 43
0
from copy import deepcopy
from dataclasses import dataclass
from pathlib import Path
from typing import Literal

import html5lib
import unicodedata2
from lxml.etree import _Element as Element

# Build, walk and serialise lxml-backed trees throughout this module.
TREE_TYPE = "lxml"

# Shared html5lib machinery: parse into lxml trees without HTML namespaces.
parser = html5lib.HTMLParser(
    tree=html5lib.treebuilders.getTreeBuilder(TREE_TYPE),
    namespaceHTMLElements=False,
)
walker = html5lib.getTreeWalker(TREE_TYPE)
serializer = html5lib.serializer.HTMLSerializer()

# Repository root: the directory containing this file.
repo = Path(__file__).parent


@dataclass
class Builder:

    path_to_html: dict[Path, Element]

    @classmethod
    def from_source_dir(cls, directory: Path, /) -> Builder:
        return cls(
            path_to_html={
                i.relative_to(directory): parser.parse(i.read_bytes())
Esempio n. 44
0
def string_from_doc(doc):
    """Render the lxml document *doc* to HTML and unescape question marks."""
    tokens = html5lib.getTreeWalker("lxml")(doc)
    rendered = html5lib.serializer.HTMLSerializer().render(tokens)
    return unescape_qmarks(rendered)
Esempio n. 45
0
 def setUp(self):
     """Give each test a fresh parser, tree walker and serializer."""
     self.treewalker = html5lib.getTreeWalker("lxml")
     self.serializer = serializer.HTMLSerializer()
     # Entities stay unresolved so tests can see them verbatim.
     self.parser = etree.XMLParser(resolve_entities=False)
Esempio n. 46
0
def render_text(s):
    """Normalise newlines in *s* and wrap each paragraph in <p> tags."""
    normalized = RE_NEWLINES.sub('\n', s)
    return ''.join(
        '<p>%s</p>' % _process_text(chunk)
        for chunk in RE_LINE_SPLIT.split(normalized)
    )


# html5lib is an optional dependency: without it the "html" renderer fails
# loudly instead of silently passing markup through.
if html5lib is None:
    def render_html(s):
        raise RuntimeError('Please install html5lib for "html" renderer')
else:
    _html_parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder('dom'),
        # NOTE(review): pre-1.0 html5lib API — modern releases moved
        # sanitising from the tokenizer to a serializer filter.
        tokenizer=html5lib.sanitizer.HTMLSanitizer,
    )
    _html_walker = html5lib.getTreeWalker('dom')
    _html_serializer = html5lib.serializer.HTMLSerializer()

    def render_html(s):
        # Parse (sanitising via the tokenizer), then re-serialise.
        stream = _html_walker(_html_parser.parse(s))
        return u''.join(_html_serializer.serialize(stream)).strip()


# Dispatch table mapping format names to renderer callables.
renderers = {
    'markdown': render_markdown,
    'html': render_html,
    'text': render_text,
}


def markup(s):
 def setUp(self):
     self.parser = etree.XMLParser(resolve_entities=False)
     self.treewalker = html5lib.getTreeWalker("lxml")
     self.serializer = serializer.HTMLSerializer()