예제 #1
0
    def parse_starttag(self, i):
        """Parse the start tag that begins at index ``i`` of ``self.rawdata``.

        The tag name is lower-cased and stored in ``self.lasttag``; the raw
        tag text is stored in ``self.__starttag_text``.  Attributes are
        collected as ``(lowercased_name, value)`` pairs, where ``value`` is
        ``None`` for attributes written without ``=...``, has one pair of
        matching surrounding quotes removed, and is passed through
        ``unescape`` when non-empty.

        Dispatch:
          * ``<tag ... />``  -> ``self.handle_startendtag(tag, attrs)``
          * ``<tag ... >``   -> ``self.handle_starttag(tag, attrs)``, then
            CDATA mode is entered if the tag is in
            ``self.CDATA_CONTENT_ELEMENTS``
          * anything else left over after the attributes -> the whole raw tag
            text is reported through ``self.handle_data``

        Returns the index just past the tag, or the negative value returned
        by ``check_for_whole_start_tag`` (presumably meaning the tag is not
        complete yet -- confirm against that helper).
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Bare attribute with no "=value" part.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip one pair of matching quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Unparseable junk after the attributes: emit the raw tag text
            # as character data instead of a tag event.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos
예제 #2
0
 def unescape(self, s):
     """Deprecated: return ``s`` processed by the module-level ``unescape``.

     Emits a ``DeprecationWarning`` attributed to the caller
     (``stacklevel=2``) before delegating.
     """
     message = ('The unescape method is deprecated and will be removed '
                'in 3.5, use html.unescape() instead.')
     warnings.warn(message, DeprecationWarning, stacklevel=2)
     result = unescape(s)
     return result
예제 #3
0
 def goahead(self, end):
     """Process as much of ``self.rawdata`` as can be handled right now.

     Scans the buffer from the start, dispatching text to ``handle_data``,
     markup starting with ``<`` to the matching ``parse_*`` method, and
     ``&``-references to ``handle_charref`` / ``handle_entityref``.  Whatever
     was not consumed is stored back into ``self.rawdata``.

     ``end`` -- when true, constructs that could not be parsed are flushed
     as character data instead of being kept for later; when false, the loop
     stops at the first construct that cannot be parsed yet.
     """
     rawdata = self.rawdata
     i = 0
     n = len(rawdata)
     # rawdata is never rebound inside the loop, so bind the method once.
     startswith = rawdata.startswith
     while i < n:
         if self.convert_charrefs and not self.cdata_elem:
             j = rawdata.find('<', i)
             if j < 0:
                 # if we can't find the next <, either we are at the end
                 # or there's more text incoming.  If the latter is True,
                 # we can't pass the text to handle_data in case we have
                 # a charref cut in half at end.  Try to determine if
                 # this is the case before proceeding by looking for an
                 # & near the end and see if it's followed by a space or ;.
                 amppos = rawdata.rfind('&', max(i, n-34))
                 if (amppos >= 0 and
                     not re.compile(r'[\s;]').search(rawdata, amppos)):
                     break  # wait till we get all the text
                 j = n
         else:
             match = self.interesting.search(rawdata, i)  # < or &
             if match:
                 j = match.start()
             else:
                 if self.cdata_elem:
                     break
                 j = n
         # Emit the plain text found before the next interesting character.
         if i < j:
             if self.convert_charrefs and not self.cdata_elem:
                 self.handle_data(unescape(rawdata[i:j]))
             else:
                 self.handle_data(rawdata[i:j])
         i = self.updatepos(i, j)
         if i == n: break
         if startswith('<', i):
             if starttagopen.match(rawdata, i): # < + letter
                 k = self.parse_starttag(i)
             elif startswith("</", i):
                 k = self.parse_endtag(i)
             elif startswith("<!--", i):
                 k = self.parse_comment(i)
             elif startswith("<?", i):
                 k = self.parse_pi(i)
             elif startswith("<!", i):
                 k = self.parse_html_declaration(i)
             elif (i + 1) < n:
                 # A lone "<" followed by something that is not markup.
                 self.handle_data("<")
                 k = i + 1
             else:
                 break
             if k < 0:
                 # The parse_* method could not handle the construct.
                 if not end:
                     break
                 # Forced flush: treat text up to and including the next
                 # ">", else up to the next "<", else a single character,
                 # as plain data.
                 k = rawdata.find('>', i + 1)
                 if k < 0:
                     k = rawdata.find('<', i + 1)
                     if k < 0:
                         k = i + 1
                 else:
                     k += 1
                 if self.convert_charrefs and not self.cdata_elem:
                     self.handle_data(unescape(rawdata[i:k]))
                 else:
                     self.handle_data(rawdata[i:k])
             i = self.updatepos(i, k)
         elif startswith("&#", i):
             match = charref.match(rawdata, i)
             if match:
                 name = match.group()[2:-1]
                 self.handle_charref(name)
                 k = match.end()
                 # Give back the terminating character unless it was ";".
                 if not startswith(';', k-1):
                     k = k - 1
                 i = self.updatepos(i, k)
                 continue
             else:
                 if ";" in rawdata[i:]:  # bail by consuming &#
                     self.handle_data(rawdata[i:i+2])
                     i = self.updatepos(i, i+2)
                 break
         elif startswith('&', i):
             match = entityref.match(rawdata, i)
             if match:
                 name = match.group(1)
                 self.handle_entityref(name)
                 k = match.end()
                 # Give back the terminating character unless it was ";".
                 if not startswith(';', k-1):
                     k = k - 1
                 i = self.updatepos(i, k)
                 continue
             match = incomplete.match(rawdata, i)
             if match:
                 # match.group() will contain at least 2 chars
                 if end and match.group() == rawdata[i:]:
                     # NOTE(review): this steps over the "&" without
                     # reporting it; only the text after it is flushed
                     # below -- confirm this is intended.
                     i = self.updatepos(i, i + 1)
                 # incomplete
                 break
             elif (i + 1) < n:
                 # not the end of the buffer, and can't be confused
                 # with some other construct
                 self.handle_data("&")
                 i = self.updatepos(i, i + 1)
             else:
                 break
         else:
             # Explicit raise so the guard is not stripped under -O.
             raise AssertionError("interesting.search() lied")
     # end while
     if end and i < n and not self.cdata_elem:
         # cdata_elem is already known to be false here.
         if self.convert_charrefs:
             self.handle_data(unescape(rawdata[i:n]))
         else:
             self.handle_data(rawdata[i:n])
         i = self.updatepos(i, n)
     self.rawdata = rawdata[i:]
예제 #4
0
 def check_num(num, expected):
     """Assert that ``num`` unescapes to ``expected`` in every numeric format.

     Each entry of ``numeric_formats`` is %-formatted with ``num`` and the
     resulting text is run through ``html.unescape``.
     """
     for template in numeric_formats:
         rendered = template % num
         actual = html.unescape(rendered)
         self.assertEqual(actual, expected,
                          msg=errmsg % (rendered, expected))
예제 #5
0
 def check(text, expected):
     """Assert that ``html.unescape(text)`` equals ``expected``."""
     actual = html.unescape(text)
     self.assertEqual(actual, expected, msg=errmsg % (text, expected))
예제 #6
0
 def test_unescape_method(self):
     """The parser's ``unescape`` method must agree with the module function."""
     from backports.html import unescape
     collector = self.get_collector()
     escaped = '&quot;&#34;&#x22;&quot&#34&#x22&#bad;'
     via_method = collector.unescape(escaped)
     via_function = unescape(escaped)
     self.assertEqual(via_method, via_function)