def test_basic_html(self):
    """A minimal HTML document without any link yields no URLs."""
    markup = (
        '<!DOCTYPE html> <html> <head> <title></title> </head> '
        '<body> </body> </html>'
    )
    found = UrlFinder(markup).get_urls()
    self.assertEqual(found, [])
def test_with_relative_link_and_relative_base_href(self):
    """A relative link is expected to be prefixed with the ``<base href>``.

    Two base values are checked in order: one starting with a slash and
    one without it.
    """
    # (markup, expected URLs) pairs, evaluated in the order listed.
    cases = (
        (
            '<!DOCTYPE html> <html> <head> '
            '<base href="/someWeirdPath/" target="_self"> </head> '
            '<body> <a href="test.html">test</a> </body> </html>',
            ["/someWeirdPath/test.html"],
        ),
        (
            '<!DOCTYPE html> <html> <head> '
            '<base href="someWeirdPath/" target="_self"> </head> '
            '<body> <a href="test.html">test</a> </body> </html>',
            ["someWeirdPath/test.html"],
        ),
    )
    for markup, expected in cases:
        found = UrlFinder(markup).get_urls()
        self.assertEqual(found, expected)
def get_requests(self):
    """Fetch ``self.request.url`` with a GET and return the requests found.

    The page is downloaded through the opener built by
    ``self.urllib2_opener``. When the body is non-empty it is decoded with
    ``decode_bytes`` and every URL reported by ``UrlFinder`` becomes a
    ``REQTYPE_LINK`` GET ``Request`` whose parent is ``self.request``.
    A ``RedirectException`` caught while fetching is turned into a single
    ``REQTYPE_REDIRECT`` request whose URL is ``str(exception)``.

    Any other failure decrements ``self.retries``; the error is re-raised
    once the counter is exhausted, otherwise the fetch is attempted again
    after ``self.retries_interval`` seconds (set to 0.5 by this method).

    Returns:
        list: the ``Request`` objects built from the response (may be empty).

    Raises:
        Exception: if ``self.request.method`` is ``"POST"``, or the last
            fetch error once no retries are left.
        NotHtmlException: if a Content-Type header is present and its media
            type is not ``text/html``.
    """
    if self.request.method == "POST":
        raise Exception("POST method with urllib is not supported yet")

    self.retries_interval = 0.5

    jar_response = cookielib.LWPCookieJar()
    jar_request = cookielib.LWPCookieJar()

    while True:
        # Every attempt starts from a clean slate, so a failure that happens
        # after cookies/links were collected cannot duplicate them on retry.
        set_cookie = []
        requests = []
        try:
            for cookie in self.request.cookies:
                jar_request.set_cookie(cookie.get_cookielib_cookie())

            opener = self.urllib2_opener(self.request, jar_response)
            try:
                req = urllib2.Request(url=self.request.url)
                jar_request.add_cookie_header(req)
                res = opener.open(req, None, self.timeout)

                set_cookie = [
                    Cookie(cookie.__dict__, self.request.url)
                    for cookie in jar_response
                ]

                # .get() yields None for a missing header on both the
                # Python 2 and Python 3 message objects, whereas item
                # access raises KeyError on Python 2.
                ctype = res.info().get("Content-Type")
                if ctype is not None:
                    media_type = ctype.lower().split(";")[0].strip()
                    if media_type != "text/html":
                        raise NotHtmlException(ERROR_CONTENTTYPE)

                html = res.read()
            finally:
                # Close the opener on every exit path, not only on success.
                opener.close()

            if html:
                html = decode_bytes(html)
                for url in UrlFinder(html).get_urls():
                    # @TODO handle FORMS
                    requests.append(
                        Request(REQTYPE_LINK, "GET", url,
                                parent=self.request,
                                set_cookie=set_cookie,
                                parent_db_id=self.request.db_id))
            break

        except RedirectException as e:
            set_cookie = [
                Cookie(cookie.__dict__, self.request.url)
                for cookie in jar_response
            ]
            requests.append(
                Request(REQTYPE_REDIRECT, "GET", str(e),
                        parent=self.request,
                        set_cookie=set_cookie,
                        parent_db_id=self.request.db_id))
            break

        except NotHtmlException:
            # Not a transient failure: never retried.
            raise

        except Exception:
            self.retries -= 1
            # "<=" so that a counter starting at 0 (or below) cannot make
            # this loop spin forever.
            if self.retries <= 0:
                raise
            time.sleep(self.retries_interval)

    return requests
def test_with_http_absolute_link(self):
    """An absolute ``http://`` link is expected to be returned unchanged."""
    markup = '<a href="http://test.lan">test</a>'
    found = UrlFinder(markup).get_urls()
    self.assertEqual(found, ["http://test.lan"])
def test_with_anchor_link(self):
    """A fragment-only link (``#test``) is expected to yield no URLs."""
    markup = '<a href="#test">test</a>'
    found = UrlFinder(markup).get_urls()
    self.assertEqual(found, [])
def test_empty_html(self):
    """An empty document is expected to yield no URLs."""
    found = UrlFinder("").get_urls()
    self.assertEqual(found, [])
def test_with_relative_link(self):
    """A relative link without a base is expected to be returned as written."""
    markup = '<a href="test.html">test</a>'
    found = UrlFinder(markup).get_urls()
    self.assertEqual(found, ["test.html"])
def get_requests(self):
    """Fetch ``self.request.url`` with a GET and return the requests found.

    The page is downloaded through the opener built by
    ``self.urllib2_opener``. When the body is non-empty it is handed to
    ``UrlFinder`` as read (no decoding is done here) and every URL it
    reports becomes a ``REQTYPE_LINK`` GET ``Request`` whose parent is
    ``self.request``. A ``RedirectException`` caught while fetching is
    turned into a single ``REQTYPE_REDIRECT`` request whose URL is
    ``str(exception)``.

    Any other failure decrements ``self.retries``; the error is re-raised
    once the counter is exhausted, otherwise the fetch is attempted again
    after ``self.retries_interval`` seconds (set to 0.5 by this method).

    Returns:
        list: the ``Request`` objects built from the response (may be empty).

    Raises:
        Exception: if ``self.request.method`` is ``"POST"``, or the last
            fetch error once no retries are left.
        NotHtmlException: if a Content-Type header is present and its media
            type is not ``text/html``.
    """
    if self.request.method == "POST":
        raise Exception("POST method with urllib is not supported yet")

    self.retries_interval = 0.5

    jar_response = cookielib.LWPCookieJar()
    jar_request = cookielib.LWPCookieJar()

    while True:
        # Every attempt starts from a clean slate, so a failure that happens
        # after cookies/links were collected cannot duplicate them on retry.
        set_cookie = []
        requests = []
        try:
            for cookie in self.request.cookies:
                jar_request.set_cookie(cookie.get_cookielib_cookie())

            opener = self.urllib2_opener(self.request, jar_response)
            try:
                req = urllib2.Request(url=self.request.url)
                jar_request.add_cookie_header(req)
                res = opener.open(req, None, self.timeout)

                set_cookie = [
                    Cookie(cookie.__dict__, self.request.url)
                    for cookie in jar_response
                ]

                # .get() yields None for a missing header on both the
                # Python 2 and Python 3 message objects, whereas item
                # access raises KeyError on Python 2.
                ctype = res.info().get("Content-Type")
                if ctype is not None:
                    media_type = ctype.lower().split(";")[0].strip()
                    if media_type != "text/html":
                        raise NotHtmlException(ERROR_CONTENTTYPE)

                html = res.read()
            finally:
                # Close the opener on every exit path, not only on success.
                opener.close()

            if html:
                for url in UrlFinder(html).get_urls():
                    # @TODO handle FORMS
                    requests.append(
                        Request(REQTYPE_LINK, "GET", url,
                                parent=self.request,
                                set_cookie=set_cookie,
                                parent_db_id=self.request.db_id))
            break

        except RedirectException as e:
            set_cookie = [
                Cookie(cookie.__dict__, self.request.url)
                for cookie in jar_response
            ]
            requests.append(
                Request(REQTYPE_REDIRECT, "GET", str(e),
                        parent=self.request,
                        set_cookie=set_cookie,
                        parent_db_id=self.request.db_id))
            break

        except NotHtmlException:
            # Not a transient failure: never retried.
            raise

        except Exception:
            self.retries -= 1
            # "<=" so that a counter starting at 0 (or below) cannot make
            # this loop spin forever.
            if self.retries <= 0:
                raise
            time.sleep(self.retries_interval)

    return requests