def goahead(self,end): # fix incomplete entity and char refs rawdata = self.rawdata i = 0 n = len(rawdata) newdata='' while i < n: j = find(rawdata,'&',i) if j==-1: break newdata = newdata + rawdata[i:j] if charref.match(rawdata, j) or entityref.match(rawdata, j): newdata = newdata + '&' else: newdata = newdata + '&' i = j+1 self.rawdata = newdata + rawdata[i:] # do normal parsing try: return HTMLParser.goahead(self,end) except HTMLParseError: pass
def goahead(self, end): # same as inherit except break statement in start of while loop rawdata = self.rawdata i = 0 n = len(rawdata) while i < n: if self.isImagePageFound: # added to stop searching through html after the image page is found break match = self.interesting.search(rawdata, i) # < or & if match: j = match.start() else: if self.cdata_elem: break j = n if i < j: self.handle_data(rawdata[i:j]) i = self.updatepos(i, j) if i == n: break startswith = rawdata.startswith if startswith("<", i): if starttagopen.match(rawdata, i): # < + letter k = self.parse_starttag(i) elif startswith("</", i): k = self.parse_endtag(i) elif startswith("<!--", i): k = self.parse_comment(i) elif startswith("<?", i): k = self.parse_pi(i) elif startswith("<!", i): k = self.parse_html_declaration(i) elif (i + 1) < n: self.handle_data("<") k = i + 1 else: break if k < 0: if not end: break k = rawdata.find(">", i + 1) if k < 0: k = rawdata.find("<", i + 1) if k < 0: k = i + 1 else: k += 1 self.handle_data(rawdata[i:k]) i = self.updatepos(i, k) elif startswith("&#", i): match = charref.match(rawdata, i) if match: name = match.group()[2:-1] self.handle_charref(name) k = match.end() if not startswith(";", k - 1): k = k - 1 i = self.updatepos(i, k) continue else: if ";" in rawdata[i:]: # bail by consuming &# self.handle_data(rawdata[0:2]) i = self.updatepos(i, 2) break elif startswith("&", i): match = entityref.match(rawdata, i) if match: name = match.group(1) self.handle_entityref(name) k = match.end() if not startswith(";", k - 1): k = k - 1 i = self.updatepos(i, k) continue match = incomplete.match(rawdata, i) if match: # match.group() will contain at least 2 chars if end and match.group() == rawdata[i:]: self.error("EOF in middle of entity or char ref") # incomplete break elif (i + 1) < n: # not the end of the buffer, and can't be confused # with some other construct self.handle_data("&") i = self.updatepos(i, i + 1) else: break else: assert 0, "interesting.search() lied" # end while if end and i < n and not self.cdata_elem: self.handle_data(rawdata[i:n]) i = self.updatepos(i, n) self.rawdata = rawdata[i:]