Example #1
0
    def __init__(self, text=None, avoidParserProblems=True,
                 initialTextIsEverything=True):
        """Set this object up as the 'root tag' and parse any initial text.

        text: optional markup to parse immediately. An object with a
        'read' attribute is treated as a file-type object, and the
        result of its read() call is parsed instead.

        avoidParserProblems: governs how input is sanitized before it
        is parsed. sgmllib copes with most bad HTML, and Beautiful
        Soup has tricks for some of the HTML that kills sgmllib, but
        data that uses self-closing tags or declarations incorrectly
        can still make Beautiful Soup choke or lose data. Any true
        value that isList() rejects (such as the default, True)
        selects the built-in PARSER_MASSAGE rules, which avoid the
        vast majority of those problems, including the two most
        common ones:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        Such problems are relatively rare, even in bad HTML, so pass
        False to skip the sanitizing and get better performance if
        they don't apply to you. Pass your own list of (RE object,
        replace method) tuples to have the input scrubbed your way.

        initialTextIsEverything: when true, done() is called once the
        initial text (if any) has been fed to the parser."""
        Tag.__init__(self, self.ROOT_TAG_NAME)

        # Swap a plain "yes, please sanitize" flag for the default rules;
        # a caller-supplied list and any false value pass through untouched.
        massage = avoidParserProblems
        if massage:
            if not isList(massage):
                massage = self.PARSER_MASSAGE
        self.avoidParserProblems = massage

        SGMLParser.__init__(self)
        self.quoteStack = []
        self.hidden = 1
        self.reset()

        if hasattr(text, 'read'):
            # File-type object: parse its contents rather than the object.
            text = text.read()
        if text:
            self.feed(text)
        if initialTextIsEverything:
            self.done()
Example #2
0
 def __getattr__(self, methodName):
     """Route an attribute lookup to the SGMLParser or Tag superclass.

     Names beginning with 'start_', 'end_' or 'do_' are delegated to
     SGMLParser.__getattr__. Any other name that does not begin with
     '__' is delegated to Tag.__getattr__. Names beginning with '__'
     are not delegated at all and raise AttributeError."""
     # startswith() checks only the prefix; find(...) == 0 would scan the
     # whole name before reporting a non-match.
     if methodName.startswith('start_') or methodName.startswith('end_') \
            or methodName.startswith('do_'):
         return SGMLParser.__getattr__(self, methodName)
     if methodName.startswith('__'):
         # Name the missing attribute so the failure can be diagnosed.
         raise AttributeError("'%s' object has no attribute '%s'"
                              % (self.__class__.__name__, methodName))
     return Tag.__getattr__(self, methodName)
Example #3
0
 def __getattr__(self, methodName):
     """Route an attribute lookup to the SGMLParser or Tag superclass.

     Names beginning with 'start_', 'end_' or 'do_' are delegated to
     SGMLParser.__getattr__. Any other name that does not begin with
     '__' is delegated to Tag.__getattr__. Names beginning with '__'
     are not delegated at all and raise AttributeError."""
     # startswith() checks only the prefix; find(...) == 0 would scan the
     # whole name before reporting a non-match.
     if methodName.startswith('start_') or methodName.startswith('end_') \
            or methodName.startswith('do_'):
         return SGMLParser.__getattr__(self, methodName)
     if methodName.startswith('__'):
         # Name the missing attribute so the failure can be diagnosed.
         raise AttributeError("'%s' object has no attribute '%s'"
                              % (self.__class__.__name__, methodName))
     return Tag.__getattr__(self, methodName)
Example #4
0
    def __init__(self,
                 text=None,
                 avoidParserProblems=True,
                 initialTextIsEverything=True):
        """Set this object up as the 'root tag' and parse any initial text.

        text: optional markup to parse immediately. An object with a
        'read' attribute is treated as a file-type object, and the
        result of its read() call is parsed instead.

        avoidParserProblems: governs how input is sanitized before it
        is parsed. sgmllib copes with most bad HTML, and Beautiful
        Soup has tricks for some of the HTML that kills sgmllib, but
        data that uses self-closing tags or declarations incorrectly
        can still make Beautiful Soup choke or lose data. Any true
        value that isList() rejects (such as the default, True)
        selects the built-in PARSER_MASSAGE rules, which avoid the
        vast majority of those problems, including the two most
        common ones:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        Such problems are relatively rare, even in bad HTML, so pass
        False to skip the sanitizing and get better performance if
        they don't apply to you. Pass your own list of (RE object,
        replace method) tuples to have the input scrubbed your way.

        initialTextIsEverything: when true, done() is called once the
        initial text (if any) has been fed to the parser."""
        Tag.__init__(self, self.ROOT_TAG_NAME)

        # Swap a plain "yes, please sanitize" flag for the default rules;
        # a caller-supplied list and any false value pass through untouched.
        massage = avoidParserProblems
        if massage:
            if not isList(massage):
                massage = self.PARSER_MASSAGE
        self.avoidParserProblems = massage

        SGMLParser.__init__(self)
        self.quoteStack = []
        self.hidden = 1
        self.reset()

        if hasattr(text, 'read'):
            # File-type object: parse its contents rather than the object.
            text = text.read()
        if text:
            self.feed(text)
        if initialTextIsEverything:
            self.done()
Example #5
0
 def parse_declaration(self, i):
     """Treat a bogus SGML declaration as raw data. Treat a CDATA
     declaration as regular data.

     i is the offset in self.rawdata at which the declaration starts.

     Returns the offset at which parsing should continue. For a CDATA
     section that is just past its closing ']]>'; for an unterminated
     CDATA section or a declaration SGMLParser rejects it is
     len(self.rawdata), since the rest of the buffer has been handed
     to handle_data. Otherwise it is whatever
     SGMLParser.parse_declaration returns."""
     if self.rawdata[i:i + 9] == '<![CDATA[':
         k = self.rawdata.find(']]>', i)
         if k == -1:
             # Unterminated CDATA: the remainder of the buffer is its
             # content. Stop exactly at the end of the buffer rather
             # than three characters past it.
             self.handle_data(self.rawdata[i + 9:])
             return len(self.rawdata)
         self.handle_data(self.rawdata[i + 9:k])
         return k + 3  # skip over the closing ']]>'
     try:
         return SGMLParser.parse_declaration(self, i)
     except SGMLParseError:
         # Bogus declaration: pass everything from here on through as data.
         toHandle = self.rawdata[i:]
         self.handle_data(toHandle)
         return i + len(toHandle)
Example #6
0
 def parse_declaration(self, i):
     """Treat a bogus SGML declaration as raw data. Treat a CDATA
     declaration as regular data.

     i is the offset in self.rawdata at which the declaration starts.

     Returns the offset at which parsing should continue. For a CDATA
     section that is just past its closing ']]>'; for an unterminated
     CDATA section or a declaration SGMLParser rejects it is
     len(self.rawdata), since the rest of the buffer has been handed
     to handle_data. Otherwise it is whatever
     SGMLParser.parse_declaration returns."""
     if self.rawdata[i:i + 9] == '<![CDATA[':
         k = self.rawdata.find(']]>', i)
         if k == -1:
             # Unterminated CDATA: the remainder of the buffer is its
             # content. Stop exactly at the end of the buffer rather
             # than three characters past it.
             self.handle_data(self.rawdata[i + 9:])
             return len(self.rawdata)
         self.handle_data(self.rawdata[i + 9:k])
         return k + 3  # skip over the closing ']]>'
     try:
         return SGMLParser.parse_declaration(self, i)
     except SGMLParseError:
         # Bogus declaration: pass everything from here on through as data.
         toHandle = self.rawdata[i:]
         self.handle_data(toHandle)
         return i + len(toHandle)
Example #7
0
 def reset(self):
     """Reset the SGMLParser state and this object's own parse state.

     Clears the tag stack, the current tag and the pending data, then
     pushes this object itself with pushTag()."""
     SGMLParser.reset(self)
     # Start from a clean slate before pushing ourselves.
     self.tagStack = []
     self.currentTag = None
     self.currentData = []
     self.pushTag(self)
Example #8
0
 def feed(self, text):
     """Massage text with the configured rules, then hand it to SGMLParser.

     Each entry of self.avoidParserProblems is a (pattern, replacement)
     pair applied in order through pattern.sub(). A false value means
     the text is passed on unchanged."""
     for pattern, replacement in self.avoidParserProblems or ():
         text = pattern.sub(replacement, text)
     SGMLParser.feed(self, text)
Example #9
0
 def reset(self):
     """Reset the SGMLParser state and this object's own parse state.

     Clears the tag stack, the current tag and the pending data, then
     pushes this object itself with pushTag()."""
     SGMLParser.reset(self)
     # Start from a clean slate before pushing ourselves.
     self.tagStack = []
     self.currentTag = None
     self.currentData = []
     self.pushTag(self)
Example #10
0
 def feed(self, text):
     """Massage text with the configured rules, then hand it to SGMLParser.

     Each entry of self.avoidParserProblems is a (pattern, replacement)
     pair applied in order through pattern.sub(). A false value means
     the text is passed on unchanged."""
     for pattern, replacement in self.avoidParserProblems or ():
         text = pattern.sub(replacement, text)
     SGMLParser.feed(self, text)