def test_escape_html(self):
    """Test escape HTML on selected codepoints.

    Checks that escaping.escape_html maps
    test_common.ASCII_AND_SELECTED_CODEPOINTS to the expected string, that
    html.unescape_html turns the escaped form back into the input except
    for the first character (expected back as U+FFFD), and that the
    non-string inputs 42 and None escape to '42' and ''.
    """
    test_input = test_common.ASCII_AND_SELECTED_CODEPOINTS
    # NOTE(review): this expected literal appears to have been
    # HTML-entity-decoded in this copy of the file.  The unescaped single
    # quote in the punctuation chunk terminates the string early, so the
    # statement does not parse as written.  Restore the literal from
    # version control rather than guessing which escapes escape_html emits.
    want = (
        u'�\x01\x02\x03\x04\x05\x06\x07'
        u'\x08\t\n\x0B\x0C\r\x0E\x0F'
        u'\x10\x11\x12\x13\x14\x15\x16\x17'
        u'\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f'
        u' !"#$%&'()*+,-./'
        u'0123456789:;<=>?'
        u'@ABCDEFGHIJKLMNO'
        u'PQRSTUVWXYZ[\]^_'
        u'`abcdefghijklmno'
        u'pqrstuvwxyz{|}~\x7f'
        u'\u00A0\u0100\u2028\u2029\ufdec\ufeff\U0001D11E')
    got = escaping.escape_html(test_input)
    self.assertEquals(
        want, got, 'escaped:\n\t%r\n!=\n\t%r' % (want, got))
    # Round trip: unescaping the escaped text should give back the input,
    # with U+FFFD in place of the input's first character.
    want, got = u'\ufffd%s' % test_input[1:], html.unescape_html(got)
    self.assertEquals(
        want, got, 'reversible:\n\t%r\n!=\n\t%r' % (want, got))

    # Non-string values: an int escapes to its decimal text, None to ''.
    self.assertEquals('42', escaping.escape_html(42))
    self.assertEquals('', escaping.escape_html(None))
def test_escape_html(self):
    """Test escape HTML on selected codepoints.

    Checks that escaping.escape_html maps
    test_common.ASCII_AND_SELECTED_CODEPOINTS to the expected string, that
    html.unescape_html turns the escaped form back into the input except
    for the first character (expected back as U+FFFD), and that the
    non-string inputs 42 and None escape to '42' and ''.
    """
    test_input = test_common.ASCII_AND_SELECTED_CODEPOINTS
    # NOTE(review): this expected literal appears to have been
    # HTML-entity-decoded in this copy of the file.  The unescaped single
    # quote in the punctuation chunk terminates the string early, so the
    # statement does not parse as written.  Restore the literal from
    # version control rather than guessing which escapes escape_html emits.
    want = (u'�\x01\x02\x03\x04\x05\x06\x07'
            u'\x08\t\n\x0B\x0C\r\x0E\x0F'
            u'\x10\x11\x12\x13\x14\x15\x16\x17'
            u'\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f'
            u' !"#$%&'()*+,-./'
            u'0123456789:;<=>?'
            u'@ABCDEFGHIJKLMNO'
            u'PQRSTUVWXYZ[\]^_'
            u'`abcdefghijklmno'
            u'pqrstuvwxyz{|}~\x7f'
            u'\u00A0\u0100\u2028\u2029\ufdec\ufeff\U0001D11E')
    got = escaping.escape_html(test_input)
    self.assertEquals(want, got, 'escaped:\n\t%r\n!=\n\t%r' % (want, got))
    # Round trip: unescaping the escaped text should give back the input,
    # with U+FFFD in place of the input's first character.
    want, got = u'\ufffd%s' % test_input[1:], html.unescape_html(got)
    self.assertEquals(want, got, 'reversible:\n\t%r\n!=\n\t%r' % (want, got))

    # Non-string values: an int escapes to its decimal text, None to ''.
    self.assertEquals('42', escaping.escape_html(42))
    self.assertEquals('', escaping.escape_html(None))
def process_raw_text(raw_text, context):
    """
    Consumes raw_text piece by piece, updating the context as it goes and
    building a normalized copy of the text.

    raw_text - A chunk of HTML/CSS/JS.
    context - The context before raw_text.

    Returns (
      the context after raw_text which may be an error context,
      a normalized version of the text or None if an error occurred,
      None or the context immediately prior to the error,
      None or the unprocessed suffix of raw_text when the error occurred)

    May raise ContextUpdateFailure which is equivalent to returning
    STATE_ERROR but with a more informative error message.
    """

    # Accumulates the normalized output; only read back on the success path.
    normalized = StringIO()

    # NOTE(review): this loop and the inner one below only make progress if
    # _process_next_token consumes at least one character per call --
    # confirm against its implementation.
    while raw_text:
        # Snapshot of the state at the start of this step; reported to the
        # caller if the step ends in an error context.
        prior_context, prior_raw_text = context, raw_text

        delim_type = delim_type_of(context)

        # If we are in an attribute value, then decode raw_text (except
        # for the delimiter) up to the next occurrence of delimiter.

        # The end of the section to decode.  Either before a delimiter
        # or > symbol that closes an attribute, at the end of the raw_text,
        # or -1 if no decoding needs to happen.
        attr_value_end = _end_of_attr_value(raw_text, delim_type)
        if attr_value_end == -1:
            # Outside an attribute value.  No need to decode.
            num_consumed, context, replacement_text = _process_next_token(
                raw_text, context)
            raw_text = raw_text[num_consumed:]
            normalized.write(replacement_text)
            if delim_type_of(context) == DELIM_SPACE_OR_TAG_END:
                # Introduce a double quote when we transition into an unquoted
                # attribute body.
                normalized.write('"')
        else:
            # Inside an attribute value.  Find the end and decode up to it.

            if delim_type == DELIM_SPACE_OR_TAG_END:
                # Check for suspicious characters in the value.
                # http://www.w3.org/TR/html5/tokenization.html
                # #attribute-value-unquoted-state
                # identifies [\0"'<=`] as transitions to error states.
                # If they occur in an unquoted value they are almost surely
                # an indication of an error in the template.
                bad = re.search(r'[\x00"\'<=`]', raw_text[:attr_value_end])
                if bad:
                    raise ContextUpdateFailure(
                        '%r in unquoted attr: %r'
                        % (bad.group(), raw_text[:attr_value_end]))

            # All of the languages we deal with (HTML, CSS, and JS) use
            # quotes as delimiters.
            # When one language is embedded in the other, we need to
            # decode delimiters before trying to parse the content in the
            # embedded language.

            # For example, in
            #       <a onclick="alert(&quot;Hello {$world}&quot;)">
            # the decoded value of the event handler is
            #       alert("Hello {$world}")
            # so to determine the appropriate escaping convention we decode
            # the attribute value before delegating to _process_next_token.

            # We could take the cross-product of two languages to avoid
            # decoding but that leads to either an explosion in the
            # number of states, or the amount of lookahead required.

            # The end of the attribute value.  At attr_value_end, or
            # attr_value_end + 1 if a delimiter needs to be consumed.
            # -1 means the value runs to the end of raw_text with no closing
            # delimiter in this chunk.
            if attr_value_end < len(raw_text):
                attr_end = attr_value_end + len(DELIM_TEXT[delim_type])
            else:
                attr_end = -1

            # Decode so that the JavaScript rules work on attribute values
            # like
            #     <a onclick='alert(&quot;{$msg}!&quot;)'>
            # If we've already processed the tokens "<a", " onclick='" to
            # get into the single quoted JS attribute context, then we do
            # three things:
            #   (1) This class will decode "&quot;" to "\"" and work below
            #       to go from STATE_JS to STATE_JSDQ_STR.
            #   (2) Then the caller checks {$msg} and realizes that $msg is
            #       part of a JS string.
            #   (3) Then, the above will identify the "'" as the end, and
            #       so we reach here with:
            #     raw_text = " ! & q u o t ; ) ' > "
            #                                  ^ ^
            #                     attr_value_end attr_end
            # We use this example more in the comments below.

            attr_value_tail = html.unescape_html(raw_text[:attr_value_end])
            # attr_value_tail is "!\")" in the example above.

            # Pick the escaper used to re-encode replacement text before it
            # is written back into the attribute: the single-quote variant
            # for single-quoted values, the double-quote variant otherwise
            # (unquoted values are emitted with double quotes here).
            if delim_type == DELIM_SINGLE_QUOTE:
                escaper = escaping.escape_html_sq_only
            else:
                escaper = escaping.escape_html_dq_only

            # Recurse on the decoded value.
            while attr_value_tail:
                num_consumed, context, replacement = _process_next_token(
                    attr_value_tail, context)
                attr_value_tail = attr_value_tail[num_consumed:]
                normalized.write(escaper(replacement))

            # TODO: Maybe check that context is legal to end an attr in.
            # Throw if the attribute ends inside a quoted string.

            if attr_end != -1:
                raw_text = raw_text[attr_end:]
                # raw_text is now ">" from the example above.

                # When an attribute ends, we're back in the tag.
                context = STATE_TAG | element_type_of(context)

                # Append the delimiter on exiting an attribute.
                if delim_type == DELIM_SINGLE_QUOTE:
                    normalized.write("'")
                else:
                    # Inserts an end quote for unquoted attributes.
                    normalized.write('"')
            else:
                # Whole tail is part of an unterminated attribute.
                if attr_value_end != len(raw_text):  # pragma: no cover
                    raise AssertionError()  # Illegal state.
                raw_text = ""
        # Stop at the first step that lands in an error context and report
        # where that step started instead of returning partial output.
        if is_error_context(context):
            return (context, None, prior_context, prior_raw_text)
    return (context, normalized.getvalue(), None, None)
def test_unescape_html(self):
    """
    Test unescape_html on corner cases like supplemental codepoints,
    re-escaping, broken escapes, etc.
    """
    # NOTE(review): several inputs below contain no entities at all in
    # this copy (e.g. 'foo<bar'), and some contain raw non-ASCII
    # characters where numeric character references would be expected.
    # This looks like the literals were HTML-entity-decoded at some point;
    # confirm them against version control.  They are reproduced here
    # unchanged.
    self.assertEqual('', html.unescape_html(''))
    self.assertEqual('foo', html.unescape_html('foo'))
    self.assertEqual('foo<bar', html.unescape_html('foo<bar'))
    self.assertEqual('foo< bar', html.unescape_html('foo< bar'))
    self.assertEqual('foo&bar', html.unescape_html('foo&amp;bar'))
    self.assertEqual('foo&bogus;bar', html.unescape_html('foo&bogus;bar'))
    self.assertEqual(u'>>>\u226b>', html.unescape_html('>>>≫&gt;'))
    self.assertEqual('""""', html.unescape_html('""""'))
    self.assertEqual('<<<<', html.unescape_html('<<<<'))
    self.assertEqual(u'\u1234\u1234', html.unescape_html('ሴሴ'))
    self.assertEqual(u'\uabcd\uabcd', html.unescape_html('ꯍꯍ'))
    self.assertEqual(u"\U0001D11E\U0001D11E", html.unescape_html('𝄞��'))
    # Broken escapes: malformed numeric references are expected to come
    # back unchanged.  The input must actually go through unescape_html;
    # comparing the literal with itself would always pass.
    self.assertEqual(
        "&#;&#gt;&#xxa0;", html.unescape_html("&#;&#gt;&#xxa0;"))
def test_unescape_html(self):
    """
    Test unescape_html on corner cases like supplemental codepoints,
    re-escaping, broken escapes, etc.
    """
    # NOTE(review): several inputs below contain no entities at all in
    # this copy (e.g. 'foo<bar'), and some contain raw non-ASCII
    # characters where numeric character references would be expected.
    # This looks like the literals were HTML-entity-decoded at some point;
    # confirm them against version control.  They are reproduced here
    # unchanged.
    self.assertEqual('', html.unescape_html(''))
    self.assertEqual('foo', html.unescape_html('foo'))
    self.assertEqual('foo<bar', html.unescape_html('foo<bar'))
    self.assertEqual('foo< bar', html.unescape_html('foo< bar'))
    self.assertEqual('foo&bar', html.unescape_html('foo&amp;bar'))
    self.assertEqual('foo&bogus;bar', html.unescape_html('foo&bogus;bar'))
    self.assertEqual(
        u'>>>\u226b>',
        html.unescape_html('>>>≫&gt;'))
    self.assertEqual(
        '""""',
        html.unescape_html('""""'))
    self.assertEqual(
        '<<<<',
        html.unescape_html('<<<<'))
    self.assertEqual(
        u'\u1234\u1234',
        html.unescape_html('ሴሴ'))
    self.assertEqual(
        u'\uabcd\uabcd',
        html.unescape_html('ꯍꯍ'))
    self.assertEqual(
        u"\U0001D11E\U0001D11E",
        html.unescape_html('𝄞��'))
    # Broken escapes: malformed numeric references are expected to come
    # back unchanged.  The input must actually go through unescape_html;
    # comparing the literal with itself would always pass.
    self.assertEqual(
        "&#;&#gt;&#xxa0;",
        html.unescape_html("&#;&#gt;&#xxa0;"))