def goahead(self,end):
        """Escape stray ampersands in the buffer, then run the normal parser.

        Every ``&`` in ``self.rawdata`` that does not begin something matching
        the ``charref`` or ``entityref`` patterns is rewritten to ``&amp;``;
        the cleaned text is stored back on ``self.rawdata`` before delegating
        to ``HTMLParser.goahead``.

        end -- forwarded unchanged to ``HTMLParser.goahead``.

        Returns whatever ``HTMLParser.goahead`` returns, or ``None`` when it
        raises ``HTMLParseError``.
        """
        rawdata = self.rawdata

        # Collect the output fragments and join once at the end; repeated
        # string concatenation is quadratic on large documents.
        pieces = []
        pos = 0
        while True:
            # str.find replaces the module-level string.find helper, which
            # only exists on Python 2.
            amp = rawdata.find('&', pos)
            if amp == -1:
                break
            pieces.append(rawdata[pos:amp])
            if charref.match(rawdata, amp) or entityref.match(rawdata, amp):
                # Already a well-formed reference: keep the ampersand as is.
                pieces.append('&')
            else:
                pieces.append('&amp;')
            pos = amp + 1
        pieces.append(rawdata[pos:])

        self.rawdata = ''.join(pieces)

        # do normal parsing
        try:
            return HTMLParser.goahead(self, end)
        except HTMLParseError:
            # Deliberate best-effort: malformed markup must not abort the
            # caller, so the parse error is dropped.
            return None
Exemple #2
0
    def goahead(self,end):
        """Escape stray ampersands in the buffer, then run the normal parser.

        Every ``&`` in ``self.rawdata`` that does not begin something matching
        the ``charref`` or ``entityref`` patterns is rewritten to ``&amp;``;
        the cleaned text is stored back on ``self.rawdata`` before delegating
        to ``HTMLParser.goahead``.

        end -- forwarded unchanged to ``HTMLParser.goahead``.

        Returns whatever ``HTMLParser.goahead`` returns, or ``None`` when it
        raises ``HTMLParseError``.
        """
        rawdata = self.rawdata

        # Collect the output fragments and join once at the end; repeated
        # string concatenation is quadratic on large documents.
        fragments = []
        start = 0
        while True:
            # str.find replaces the module-level string.find helper, which
            # only exists on Python 2.
            amp_at = rawdata.find('&', start)
            if amp_at == -1:
                break
            fragments.append(rawdata[start:amp_at])
            is_reference = (charref.match(rawdata, amp_at)
                            or entityref.match(rawdata, amp_at))
            # A well-formed reference keeps its ampersand; anything else is
            # escaped so the parser treats it as literal text.
            fragments.append('&' if is_reference else '&amp;')
            start = amp_at + 1
        fragments.append(rawdata[start:])

        self.rawdata = ''.join(fragments)

        # do normal parsing
        try:
            return HTMLParser.goahead(self, end)
        except HTMLParseError:
            # Deliberate best-effort: malformed markup must not abort the
            # caller, so the parse error is dropped.
            return None
 def goahead(self, end):
     """Scan ``self.rawdata`` and dispatch the parser callbacks.

     Per the original author's note this is a copy of the inherited loop
     with one addition: scanning stops as soon as ``self.isImagePageFound``
     is true. Whatever was not consumed is left in ``self.rawdata``.

     end -- when true, markup that cannot be parsed is emitted as text
            instead of being kept in the buffer, and any unscanned tail is
            flushed through ``handle_data`` (unless ``self.cdata_elem`` is
            set). Note that this final flush also runs after the
            image-page early stop.
     """
     rawdata = self.rawdata
     i = 0
     n = len(rawdata)
     while i < n:
         if self.isImagePageFound:  # added to stop searching through html after the image page is found
             break
         match = self.interesting.search(rawdata, i)  # < or &
         if match:
             j = match.start()
         else:
             if self.cdata_elem:
                 break
             j = n
         if i < j:
             # Plain text up to the next interesting character.
             self.handle_data(rawdata[i:j])
         i = self.updatepos(i, j)
         if i == n:
             break
         startswith = rawdata.startswith
         if startswith("<", i):
             if starttagopen.match(rawdata, i):  # < + letter
                 k = self.parse_starttag(i)
             elif startswith("</", i):
                 k = self.parse_endtag(i)
             elif startswith("<!--", i):
                 k = self.parse_comment(i)
             elif startswith("<?", i):
                 k = self.parse_pi(i)
             elif startswith("<!", i):
                 k = self.parse_html_declaration(i)
             elif (i + 1) < n:
                 # A lone "<" followed by more input is just text.
                 self.handle_data("<")
                 k = i + 1
             else:
                 break
             if k < 0:
                 # The sub-parser could not finish the construct.
                 if not end:
                     break
                 # At end of input: emit the broken construct as text, up to
                 # and including the next ">", else up to the next "<", else
                 # a single character.
                 k = rawdata.find(">", i + 1)
                 if k < 0:
                     k = rawdata.find("<", i + 1)
                     if k < 0:
                         k = i + 1
                 else:
                     k += 1
                 self.handle_data(rawdata[i:k])
             i = self.updatepos(i, k)
         elif startswith("&#", i):
             match = charref.match(rawdata, i)
             if match:
                 name = match.group()[2:-1]
                 self.handle_charref(name)
                 k = match.end()
                 if not startswith(";", k - 1):
                     # The terminator was not ";": leave it in the buffer.
                     k = k - 1
                 i = self.updatepos(i, k)
                 continue
             else:
                 if ";" in rawdata[i:]:  # bail by consuming &#
                     # Offsets must be relative to the current position.
                     # The previous absolute slice/offset (0:2 and 2) emitted
                     # the first two characters of the buffer and moved the
                     # cursor backwards, duplicating already-handled text.
                     self.handle_data(rawdata[i:i + 2])
                     i = self.updatepos(i, i + 2)
                 break
         elif startswith("&", i):
             match = entityref.match(rawdata, i)
             if match:
                 name = match.group(1)
                 self.handle_entityref(name)
                 k = match.end()
                 if not startswith(";", k - 1):
                     # The terminator was not ";": leave it in the buffer.
                     k = k - 1
                 i = self.updatepos(i, k)
                 continue
             match = incomplete.match(rawdata, i)
             if match:
                 # match.group() will contain at least 2 chars
                 if end and match.group() == rawdata[i:]:
                     self.error("EOF in middle of entity or char ref")
                 # incomplete
                 break
             elif (i + 1) < n:
                 # not the end of the buffer, and can't be confused
                 # with some other construct
                 self.handle_data("&")
                 i = self.updatepos(i, i + 1)
             else:
                 break
         else:
             assert 0, "interesting.search() lied"
     # end while
     if end and i < n and not self.cdata_elem:
         # Final flush: whatever is left is delivered as text.
         self.handle_data(rawdata[i:n])
         i = self.updatepos(i, n)
     self.rawdata = rawdata[i:]