            triage.add_resource(page, resource, str(status))
    return suite

def crawl():
    """
    The crawl function calls a crawler to gather data about a website which
    can be used by other test suites.
    """
    global pages
    try:
        driver = webdriver.Remote(command_executor='http://127.0.0.1:4444/wd/hub', desired_capabilities=DesiredCapabilities.FIREFOX)
    except urllib2.URLError, e:
        print >> stderr, "Could not open connection to Selenium.  Did you start it?"
        exit(1)
    if not delay == 0:
        print >> stderr, "Crawler request delay: %f seconds" % delay
    crawler = qa_nettools.crawler(driver, domain_filter=domain_filter, delay=delay, excludes=crawler_excludes)
    pages = crawler.crawl(start_url)
    driver.quit()

if __name__ == '__main__':
    STATUS = 0
    total = 0
    failures = 0
    # option parsing
    usage = """\
Usage: %prog --target-url URL --domain-filter STRING --wrong-url-excludes LIST

Description:
  %prog can be used to crawl domains or sub-urls to find dead links and
  resources which are bad.
#!/usr/bin/env python
# Created by Sam Gleske
# Mon Feb 17 17:51:02 EST 2014
# Ubuntu 13.10
# Linux 3.11.0-12-generic x86_64
# Python 2.7.5+

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from qa_nettools import crawler

driver = webdriver.Remote(command_executor='http://127.0.0.1:4444/wd/hub', desired_capabilities=DesiredCapabilities.FIREFOX)
crawler = crawler(driver, domain_filter="example.com")
pages = crawler.crawl('http://example.com/')

for page in pages.keys():
    for link in pages[page]:
        print "page:%(page)s, link:%(link)s" % {'page': page, 'link': link}
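Since the point of crawling is to find dead links, a natural next step is to probe every link the crawler returned. The sketch below is only an illustration, not part of qa_nettools: the check_links helper and its timeout are assumptions, and it relies only on the shape of the pages dictionary shown above (page URL mapped to a list of link URLs).

# Illustrative sketch (not part of qa_nettools): request each crawled link
# with urllib2 and collect the ones which respond with an error or not at all.
import urllib2

def check_links(pages):
    """Return a dict mapping each bad link to the HTTP status or error it gave.

    Assumes pages maps a page URL to a list of link URLs, as produced by
    crawler.crawl() in the example above.
    """
    bad = {}
    for page in pages.keys():
        for link in pages[page]:
            try:
                # the timeout value is an arbitrary choice for this sketch
                urllib2.urlopen(link, timeout=10)
            except urllib2.HTTPError, e:
                # the server answered, but with an error status (404, 500, ...)
                bad[link] = str(e.code)
            except urllib2.URLError, e:
                # no usable answer at all (DNS failure, connection refused, ...)
                bad[link] = str(e.reason)
    return bad

# Example: report bad links found on the pages crawled above.
for link, status in check_links(pages).items():
    print "bad link:%(link)s, status:%(status)s" % {'link': link, 'status': status}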