Esempio n. 1
0
 def __init__(self, http_resp):
     """
     Run ``HTMLParser.__init__`` with ``self._parse`` temporarily replaced
     by a no-op, then put the original ``_parse`` back on the instance.

     NOTE(review): presumably the parent constructor calls ``self._parse``
     and the no-op is what suppresses that call -- confirm against
     HTMLParser.

     :param http_resp: The HTTP response handed to ``HTMLParser.__init__``.
     """
     # Save "_parse" reference
     orig_parse = self._parse
     # Monkeypatch it!
     self._parse = lambda arg: None
     try:
         # Now call parent's __init__
         HTMLParser.__init__(self, http_resp)
     finally:
         # Restore it even when the parent's __init__ raises, so the
         # instance is never left with the no-op "_parse"
         self._parse = orig_parse
Esempio n. 2
0
 def __init__(self, http_resp):
     """
     Run ``HTMLParser.__init__`` with ``self._parse`` temporarily replaced
     by a no-op, then put the original ``_parse`` back on the instance.

     NOTE(review): presumably the parent constructor calls ``self._parse``
     and the no-op is what suppresses that call -- confirm against
     HTMLParser.

     :param http_resp: The HTTP response handed to ``HTMLParser.__init__``.
     """
     # Save "_parse" reference
     orig_parse = self._parse
     # Monkeypatch it!
     self._parse = lambda arg: None
     try:
         # Now call parent's __init__
         HTMLParser.__init__(self, http_resp)
     finally:
         # Restore it even when the parent's __init__ raises, so the
         # instance is never left with the no-op "_parse"
         self._parse = orig_parse
Esempio n. 3
0
    def __init__(self, http_resp):
        """
        Choose the parser that matches the response and store an instance
        of it in ``self._parser``.

        The checks run in a deliberate order: images are rejected first
        since they account for a large share of the URLs in a site, WML is
        tested next because it is a very specific match, and only then the
        generic text/HTML check runs (swapping those two would mean WML is
        never reached). PDF and SWF are tried last.

        :param http_resp: The HTTP response to build a parser for.
        :raise BaseFrameworkException: When the response is an image or no
                                       parser matches it.
        """
        # Images never get a parser, bail out before anything else
        if http_resp.is_image():
            error = 'There is no parser for images.'
            raise BaseFrameworkException(error)

        # Specific formats before generic ones, see the docstring
        if self._is_wml(http_resp):
            parser_class = WMLParser
        elif http_resp.is_text_or_html():
            parser_class = HTMLParser
        elif self._is_pdf(http_resp):
            parser_class = PDFParser
        elif self._is_swf(http_resp):
            parser_class = SWFParser
        else:
            error = 'There is no parser for "%s".' % http_resp.get_url()
            raise BaseFrameworkException(error)

        self._parser = parser_class(http_resp)
    def test_parse_html_performance(self):
        """
        Build an HTML response from ``self.HTML_FILE`` and parse it 40 times
        with ``HTMLParser``, calling ``get_dom()`` on each parser.

        The test makes no assertions. NOTE(review): the commented-out
        ``measure_memory`` calls and the final long sleep suggest this is a
        manual memory-profiling harness rather than an automated check --
        confirm before running it in a regular test suite.
        """
        headers = Headers()
        headers['content-type'] = 'text/html'
        # Close the fixture file deterministically instead of leaving the
        # handle to be released whenever the interpreter gets around to it
        with open(self.HTML_FILE) as html_file:
            body = html_file.read()
        url = URL('http://www.w3af.org/')
        response = HTTPResponse(200, body, headers, url, url, charset='utf-8')

        #self.measure_memory(1)

        # Only filled when the commented-out append below is enabled
        parsers = []

        for _ in xrange(40):
            p = HTMLParser(response)
            p.get_dom()
            #parsers.append(p)

        # Clear any reference to the parser
        #del p
        #parsers = []

        #self.measure_memory(2)

        # Blocks for six minutes; presumably leaves the process alive so its
        # memory usage can be inspected from outside -- TODO confirm
        time.sleep(360)