def __init__(self, text=None, avoidParserProblems=True,
             initialTextIsEverything=True):
    """Set this object up as the 'root tag' and optionally parse some text.

    text -- Markup to parse. May be a string or a file-type object
    (anything with a 'read' attribute, whose read() result is parsed).
    Nothing is fed to the parser when this is empty or None.

    avoidParserProblems -- Controls sanitizing of the input before it
    reaches sgmllib. sgmllib will process most bad HTML, and there are
    tricks here for dealing with some HTML that kills sgmllib, but
    parsing can still choke or lose data if the markup uses
    self-closing tags or declarations incorrectly. The two most common
    instances of invalid HTML that choke sgmllib are fixed by the
    default parser massage techniques:

     <br/> (No space between name of closing tag and tag close)
     <! --Comment--> (Extraneous whitespace in declaration)

    Accepted values:
     * any true value that is not a list (the default): use the
       built-in massage rules from self.PARSER_MASSAGE.
     * a list of (RE object, replace method) tuples: scrub the input
       with your own rules instead.
     * a false value: skip sanitizing altogether. These problems are
       relatively rare, even in bad HTML, so this is worth doing for
       better performance when they don't apply to you.

    initialTextIsEverything -- When true, self.done() is called once
    the initial text has been fed in.
    """
    Tag.__init__(self, self.ROOT_TAG_NAME)

    # A true value that isn't an explicit rule list selects the defaults.
    wants_default_rules = (avoidParserProblems
                           and not isList(avoidParserProblems))
    if wants_default_rules:
        avoidParserProblems = self.PARSER_MASSAGE
    self.avoidParserProblems = avoidParserProblems

    SGMLParser.__init__(self)
    self.quoteStack = []
    self.hidden = 1
    self.reset()

    if hasattr(text, 'read'):
        # It's a file-type object; parse whatever it contains.
        text = text.read()
    if text:
        self.feed(text)
    if initialTextIsEverything:
        self.done()
def __getattr__(self, methodName):
    """Route an attribute lookup to the right superclass by name.

    Names starting with 'start_', 'end_' or 'do_' are passed to
    SGMLParser.__getattr__. Any other name that does not start with a
    double underscore is passed to Tag.__getattr__. Names starting
    with a double underscore raise AttributeError.
    """
    # Tag-handler style names belong to the SGML parser side.
    for handlerPrefix in ('start_', 'end_', 'do_'):
        if methodName.startswith(handlerPrefix):
            return SGMLParser.__getattr__(self, methodName)

    # Double-underscore names are never forwarded to Tag.
    if methodName.startswith('__'):
        raise AttributeError

    return Tag.__getattr__(self, methodName)
def parse_declaration(self, i):
    """Treat a bogus SGML declaration as raw data. Treat a CDATA
    declaration as regular data.

    i -- Index into self.rawdata at which the declaration starts.

    Returns the index in self.rawdata at which parsing should resume.

    For a CDATA section, the text between '<![CDATA[' and ']]>' is
    passed to self.handle_data(). If the closing ']]>' is missing, the
    rest of the buffer is treated as the section's content and the
    returned index is the end of the buffer (never beyond it).

    For any other declaration, SGMLParser.parse_declaration() is tried
    first; if it raises SGMLParseError, the remainder of the buffer is
    passed to self.handle_data() as plain data.
    """
    # NOTE(review): SOURCE contains two definitions of this method with
    # the same body; if both sit in the same class, the later one wins.
    cdataOpen = '<![CDATA['
    cdataClose = ']]>'
    rawdata = self.rawdata

    if rawdata[i:i + len(cdataOpen)] == cdataOpen:
        contentStart = i + len(cdataOpen)
        closeAt = rawdata.find(cdataClose, contentStart)
        if closeAt == -1:
            # Unterminated section: consume to the end of the buffer and
            # resume exactly there, instead of len(cdataClose) past it.
            self.handle_data(rawdata[contentStart:])
            return len(rawdata)
        self.handle_data(rawdata[contentStart:closeAt])
        return closeAt + len(cdataClose)

    try:
        return SGMLParser.parse_declaration(self, i)
    except SGMLParseError:
        # Bogus declaration: hand everything from here on over as data.
        toHandle = self.rawdata[i:]
        self.handle_data(toHandle)
        return i + len(toHandle)
def parse_declaration(self, i):
    """Treat a bogus SGML declaration as raw data. Treat a CDATA
    declaration as regular data.

    i -- Index into self.rawdata at which the declaration starts.

    Returns the index in self.rawdata at which parsing should resume.

    For a CDATA section, the text between '<![CDATA[' and ']]>' is
    passed to self.handle_data(). If the closing ']]>' is missing, the
    rest of the buffer is treated as the section's content and the
    returned index is the end of the buffer (never beyond it).

    For any other declaration, SGMLParser.parse_declaration() is tried
    first; if it raises SGMLParseError, the remainder of the buffer is
    passed to self.handle_data() as plain data.
    """
    # NOTE(review): SOURCE contains two definitions of this method with
    # the same body; if both sit in the same class, the later one wins.
    cdataOpen = '<![CDATA['
    cdataClose = ']]>'
    rawdata = self.rawdata

    if rawdata[i:i + len(cdataOpen)] == cdataOpen:
        contentStart = i + len(cdataOpen)
        closeAt = rawdata.find(cdataClose, contentStart)
        if closeAt == -1:
            # Unterminated section: consume to the end of the buffer and
            # resume exactly there, instead of len(cdataClose) past it.
            self.handle_data(rawdata[contentStart:])
            return len(rawdata)
        self.handle_data(rawdata[contentStart:closeAt])
        return closeAt + len(cdataClose)

    try:
        return SGMLParser.parse_declaration(self, i)
    except SGMLParseError:
        # Bogus declaration: hand everything from here on over as data.
        toHandle = self.rawdata[i:]
        self.handle_data(toHandle)
        return i + len(toHandle)
def reset(self):
    """Reset the SGML parser and this object's own parsing state.

    After SGMLParser.reset() runs, the pending-data buffer and the tag
    stack are emptied, the current tag is cleared, and this object is
    pushed via self.pushTag() (presumably making the root the current
    tag again -- pushTag is defined elsewhere).
    """
    SGMLParser.reset(self)
    self.tagStack = []
    self.currentTag = None
    self.currentData = []
    # Start over with this object itself as the first pushed tag.
    self.pushTag(self)
def feed(self, text):
    """Apply the configured massage rules to text, then feed the
    result to SGMLParser.feed().

    self.avoidParserProblems, when true, is iterated as a sequence of
    (RE object, replacement) pairs; each pair is applied in order with
    RE.sub(replacement, text). When it is false the text is passed
    through untouched.
    """
    massageRules = self.avoidParserProblems
    if massageRules:
        for pattern, replacement in massageRules:
            text = pattern.sub(replacement, text)
    SGMLParser.feed(self, text)