Beispiel #1
0
    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        """Assert that parsing ``to_parse`` yields the expected document.

        ``to_parse`` is parsed with ``self.default_builder`` and the decoded
        result is compared with ``self.document_for(compare_parsed_to)``.
        When ``compare_parsed_to`` is None, ``to_parse`` itself is used as
        the expected markup.
        """
        parsed = BeautifulSoup(to_parse, builder=self.default_builder)
        expected_markup = (
            to_parse if compare_parsed_to is None else compare_parsed_to)
        self.assertEqual(parsed.decode(), self.document_for(expected_markup))
Beispiel #2
0
 def __init__(self, namespaceHTMLElements, soup=None):
     """Remember the soup to build into, then initialise the base builder.

     When ``soup`` is falsy, a fresh ``BeautifulSoup("", "html.parser")``
     is created and used instead.
     """
     if not soup:
         # Function-scope import, as in the original; presumably this
         # avoids an import cycle -- confirm before hoisting it.
         from contrib.bs4 import BeautifulSoup
         soup = BeautifulSoup("", "html.parser")
     self.soup = soup
     super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
Beispiel #3
0
 def Login( self, login, password ):
   """Log in with the given credentials unless already logged in.

   The credentials are posted as ``email`` / ``pass`` to
   ``_COSMIC_LOGIN_URL`` through ``self._opener``. On success
   ``self._is_loggedin`` is set to True.

   Raises:
     RuntimeError: if ``login`` or ``password`` is empty, or if the
       response page contains a ``<dd class="login-error">`` element.
       In the latter case the message carries the login name, a masked
       password and the text of any ``<h3>`` elements on the page.
   """
   # Don't login twice.
   if self._is_loggedin:
     return
   if not login:
     raise RuntimeError( self._errmsg_cannot_login + " " +
                         self._errmsg_empty_email )
   if not password:
     raise RuntimeError( self._errmsg_cannot_login + " " +
                         self._errmsg_empty_pass )
   login_data = urllib.urlencode( {
     "email" : login,
     "pass" : password
   } )
   response = self._opener.open( _COSMIC_LOGIN_URL, login_data )
   htmldata = BeautifulSoup( response.read(), features="html.parser" )
   login_info = htmldata.find( "dd", { "class": "login-error" } )
   if login_info is not None:
     # Report the password masked; it is known to be non-empty here.
     vispass = "*" * len( password )
     errormsg = self._errmsg_cannot_login + " " + \
       ( self._errmsg_credentials % ( login, vispass ) )
     # get_text() rather than .string: .string is None for an <h3> with
     # more than one child, which would make the concatenation fail.
     servermsg = "".join( msg.get_text() + "."
                          for msg in htmldata.find_all( "h3" ) )
     # Only mention the server message when the page actually had one.
     if servermsg:
       errormsg += " " + ( self._errmsg_servermsg % servermsg )
     raise RuntimeError( errormsg )
   self._is_loggedin = True
Beispiel #4
0
    def test_formatter_processes_script_tag_for_xml_documents(self):
        doc = """
  <script type="text/javascript">
  </script>
"""
        soup = BeautifulSoup(doc, "lxml-xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        soup.script.string = 'console.log("< < hey > > ");'
        encoded = soup.encode()
        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
Beispiel #5
0
 def FindGeneID( self, name ):
   """Return the integer ID of the gene called ``name``.

   Opens ``_COSMIC_SEARCH_URL % name`` through ``self._opener`` and
   collects the hidden ``<input>`` elements named ``id`` and ``ln`` from
   the result page. The two lists are paired by position; the ID whose
   ``ln`` value equals ``name`` is returned.

   Raises:
     RuntimeError: if not logged in, if ``name`` is None or empty, if
       the page has no ``id`` inputs, or if no ``ln`` input matches
       ``name``.
   """
   if not self._is_loggedin:
     raise RuntimeError( self._errmsg_no_login )
   # An empty name is rejected like None instead of being sent to the server.
   if not name:
     raise RuntimeError( self._errmsg_empty_gene )
   data = self._opener.open( _COSMIC_SEARCH_URL % name )
   htmldata = BeautifulSoup( data.read(), features="html.parser" )
   ids = htmldata.find_all( "input", { "name": "id", "type": "hidden" } )
   lns = htmldata.find_all( "input", { "name": "ln", "type": "hidden" } )
   if not ids:
     raise RuntimeError( self._errmsg_invalid_gene )
   # zip() stops at the shorter list, so a page with fewer "ln" than "id"
   # inputs ends in the parse error below rather than an IndexError.
   for id_input, ln_input in zip( ids, lns ):
     if ln_input.get( "value" ) == name:
       return int( id_input.get( "value" ) )
   raise RuntimeError( self._errmsg_parse_error )
Beispiel #6
0
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Builds a document with ``rdoc(num_elements)`` and times how long
    ``BeautifulSoup`` takes to parse it with each parser in turn,
    printing one line per parser. If a parser raises, a notice and the
    traceback are printed and the benchmark moves on to the next parser.
    """
    # Single-argument print(...) and "except Exception:" behave the same
    # on Python 2 and Python 3.
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        try:
            started = time.time()
            BeautifulSoup(data, parser)
            elapsed = time.time() - started
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("BS4+%s parsed the markup in %.2fs." % (parser, elapsed))
Beispiel #7
0
 def fragmentClass(self):
     """Replace ``self.soup`` with a new empty soup used as a fragment root.

     The new soup is named "[document_fragment]" and is returned wrapped
     in an ``Element``.
     """
     from contrib.bs4 import BeautifulSoup
     fragment_root = BeautifulSoup("", "html.parser")
     fragment_root.name = "[document_fragment]"
     self.soup = fragment_root
     return Element(fragment_root, fragment_root, None)
Beispiel #8
0
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    """A ``treebuilder_base.TreeBuilder`` that builds into ``self.soup``.

    Nodes handed back to the caller are ``Element`` / ``TextNode`` wrappers
    around the objects stored in the soup.
    """

    def __init__(self, namespaceHTMLElements, soup=None):
        """Remember the soup to build into, then initialise the base builder.

        When ``soup`` is falsy, a fresh ``BeautifulSoup("", "html.parser")``
        is created and used instead.
        """
        if not soup:
            from contrib.bs4 import BeautifulSoup
            soup = BeautifulSoup("", "html.parser")
        self.soup = soup
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        """Reset the soup and return it wrapped as the document node."""
        self.soup.reset()
        root = self.soup
        return Element(root, root, None)

    def insertDoctype(self, token):
        """Build a ``Doctype`` from the token's fields and add it to the soup."""
        doctype = Doctype.for_name_and_ids(
            token["name"], token["publicId"], token["systemId"])
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        """Create a new tag in the soup and return it wrapped in an ``Element``."""
        return Element(
            self.soup.new_tag(name, namespace), self.soup, namespace)

    def commentClass(self, data):
        """Wrap ``data`` as a ``Comment`` inside a ``TextNode``."""
        comment = Comment(data)
        return TextNode(comment, self.soup)

    def fragmentClass(self):
        """Replace ``self.soup`` with a new empty soup used as a fragment root.

        The new soup is named "[document_fragment]" and is returned wrapped
        in an ``Element``.
        """
        from contrib.bs4 import BeautifulSoup
        fragment_root = BeautifulSoup("", "html.parser")
        fragment_root.name = "[document_fragment]"
        self.soup = fragment_root
        return Element(fragment_root, fragment_root, None)

    def appendChild(self, node):
        """Append the object wrapped by ``node`` to the soup."""
        # NOTE: this path is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        """Return the soup that has been built."""
        return self.soup

    def getFragment(self):
        """Return the object wrapped by the base class's fragment."""
        fragment = treebuilder_base.TreeBuilder.getFragment(self)
        return fragment.element

    def testSerializer(self, element):
        """Render the tree under ``element`` as text, one node per line.

        Every line starts with "|" followed by indentation; attributes and
        children are indented two spaces deeper than their tag. The lines
        are joined with newlines and returned as one string.
        """
        from contrib.bs4 import BeautifulSoup
        lines = []
        doctype_pattern = re.compile(
            r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')

        def serializeElement(element, indent=0):
            pad = ' ' * indent

            if isinstance(element, BeautifulSoup):
                # No special handling: a soup object falls through to the
                # checks below like any other node.
                pass

            if isinstance(element, Doctype):
                match = doctype_pattern.match(element)
                if not match:
                    lines.append("|%s<!DOCTYPE >" % (pad, ))
                elif match.lastindex > 1:
                    # A PUBLIC or SYSTEM identifier was present.
                    doctype_name = match.group(1)
                    publicId = match.group(2) or ""
                    systemId = match.group(3) or match.group(4) or ""
                    lines.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                                 (pad, doctype_name, publicId, systemId))
                else:
                    lines.append(
                        "|%s<!DOCTYPE %s>" % (pad, match.group(1)))
                return

            if isinstance(element, Comment):
                lines.append("|%s<!-- %s -->" % (pad, element))
                return

            if isinstance(element, NavigableString):
                lines.append("|%s\"%s\"" % (pad, element))
                return

            # Anything else is treated as a tag.
            tag_name = element.name
            if element.namespace:
                tag_name = "%s %s" % (prefixes[element.namespace], tag_name)
            lines.append("|%s<%s>" % (pad, tag_name))

            if element.attrs:
                attr_pad = ' ' * (indent + 2)
                pairs = []
                for key, val in element.attrs.items():
                    if isinstance(key, NamespacedAttribute):
                        key = "%s %s" % (prefixes[key.namespace], key.name)
                    if isinstance(val, list):
                        val = " ".join(val)
                    pairs.append((key, val))
                pairs.sort()
                for key, val in pairs:
                    lines.append('|%s%s="%s"' % (attr_pad, key, val))

            for child in element.children:
                serializeElement(child, indent + 2)

        serializeElement(element, 0)

        return "\n".join(lines)
Beispiel #9
0
        try:
            if os.path.exists(data):
                print '"%s" looks like a filename. Reading data from the file.' % data
                with open(data) as fp:
                    data = fp.read()
        except ValueError:
            # This can happen on some platforms when the 'filename' is
            # too long. Assume it's data and not a filename.
            pass
        print

    for parser in basic_parsers:
        print "Trying to parse your markup with %s" % parser
        success = False
        try:
            soup = BeautifulSoup(data, features=parser)
            success = True
        except Exception, e:
            print "%s could not parse the markup." % parser
            traceback.print_exc()
        if success:
            print "Here's what %s did with the markup:" % parser
            print soup.prettify()

        print "-" * 80

def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running.
Beispiel #10
0
 def soup(self, markup, **kwargs):
     """Return a ``BeautifulSoup`` object built from ``markup``.

     A ``builder`` keyword argument selects the builder; without one,
     ``self.default_builder`` is used. Remaining keyword arguments are
     passed through to ``BeautifulSoup``.
     """
     fallback_builder = self.default_builder
     chosen_builder = kwargs.pop('builder', fallback_builder)
     return BeautifulSoup(markup, builder=chosen_builder, **kwargs)