def test_challenge(self): crawler = WebCrawler(5, VerboseCrawlerLogger) crawler.crawl("http://triplebyte.github.io/web-crawler-test-site/test2", None, True) target_url = "http://triplebyte.github.io/web-crawler-test-site/test2/page2.html" print(crawler.graph[target_url]) self.assertIsNotNone(crawler.graph[target_url])
def test_crawling(self):
    crawler = WebCrawler(100, SilentCrawlerLogger)
    base = "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/"
    crawler.crawl(base, None, True)
    self.assertIn(base + "page2", crawler.graph.nodes)
    self.assertIn(base + "page2-real", crawler.graph.nodes)
    self.assertIn(base + "page2-fake", crawler.graph.nodes)
    self.assertEqual(crawler.graph.nodes[base + "page2-real"].status, 'success')
    # A 404 still counts as a completed fetch: the node records the HTTP
    # status code, but its crawl status is 'success'.
    self.assertEqual(crawler.graph.nodes[base + "page2-fake"].status_code, 404)
    self.assertEqual(crawler.graph.nodes[base + "page2-fake"].status, 'success')
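# The assertions above pin down the minimal shape of a graph node: a crawl
# status, the HTTP status code, and (in the tests below) the request type
# used. A hypothetical sketch of that node, assuming nothing beyond what the
# tests check; the real class lives in the crawler module:
class CrawlerNode:
    def __init__(self, url):
        self.url = url
        self.status = None        # e.g. 'success' once the fetch completes
        self.status_code = None   # HTTP status code, e.g. 200 or 404
        self.request_type = None  # 'get' for pages, 'head' for assets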
def test_challenge(self): crawler = WebCrawler(5, VerboseCrawlerLogger) crawler.crawl("triplebyte.github.io/web-crawler-test-site/test1", None, True) url = "http://triplebyte.github.io/web-crawler-test-site/test1/SVG_logo.svg" self.assertEqual(crawler.graph[url].request_type, "head")
def test_crawling(self):
    crawler = WebCrawler(100, SilentCrawlerLogger)
    base = "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/"
    crawler.crawl(base, None, True)
    self.assert_crawled_with_get(base + "page2", crawler)
    self.assert_crawled_with_get(base + "page2-real", crawler)
    self.assert_crawled_with_get(base + "page2-fake", crawler)
    self.assertEqual(crawler.graph.nodes[base + "page2-real"].status, 'success')
    self.assertEqual(crawler.graph.nodes[base + "page2-fake"].status_code, 404)
    self.assertEqual(crawler.graph.nodes[base + "page2-fake"].status, 'success')
    # Off-site images are recorded in the graph but only probed with HEAD.
    image_url = ("http://cdn.business2community.com/wp-content/uploads/2013/07/"
                 "terrible-content.jpg")
    self.assertIn(image_url, crawler.graph.nodes)
    self.assertEqual(crawler.graph.nodes[image_url].request_type, 'head')
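# `assert_crawled_with_get` is not defined in this snippet. A minimal sketch
# of what the calls above imply it checks (hypothetical helper; the node API
# is assumed from the surrounding assertions):
def assert_crawled_with_get(self, url, crawler):
    # The URL must be in the graph and must have been fetched with a full GET.
    self.assertIn(url, crawler.graph.nodes)
    self.assertEqual(crawler.graph.nodes[url].request_type, "get")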
def test_challenge(self): crawler = WebCrawler(5, VerboseCrawlerLogger) crawler.crawl( "http://triplebyte.github.io/web-crawler-test-site/test4/", None, True) self.assertTrue( "https://triplebyte.github.io/web-crawler-test-site/test4/page3" in crawler.graph.nodes)
def test_challenge(self):
    # The bug here is that the crawler will hang. Don't sit around waiting
    # for it to finish!
    crawler = WebCrawler(5, VerboseCrawlerLogger)
    crawler.crawl("http://triplebyte.github.io/web-crawler-test-site/test3/",
                  None, True)
    self.assertIn("http://blah.com:7091", crawler.graph.nodes)
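# Since a buggy crawler hangs on test3, a fail-fast harness is handy. A
# minimal sketch, assuming only the crawl() signature used above: run the
# crawl in a daemon thread and abort the test after a deadline.
import threading

def crawl_with_timeout(crawler, url, timeout_seconds=30):
    worker = threading.Thread(target=crawler.crawl,
                              args=(url, None, True), daemon=True)
    worker.start()
    worker.join(timeout_seconds)
    if worker.is_alive():
        raise AssertionError(
            "crawl of %s did not finish within %ds" % (url, timeout_seconds))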
def test_crawling_triplebyte(self):
    # This test crawls the live www.triplebyte.com site, so it depends on
    # the site's current link structure.
    crawler = WebCrawler(100, SilentCrawlerLogger)
    crawler.crawl("https://www.triplebyte.com", None, True)
    self.assertIn("https://www.triplebyte.com", crawler.graph.nodes)
    self.assertIn("https://triplebyte.com/careers", crawler.graph.nodes)
    self.assertEqual(
        crawler.graph.nodes["http://www.olark.com?welcome"].request_type,
        "head")
import argparse

import loggers
# WebCrawler's home module isn't shown in this snippet; adjust the import to
# wherever the class actually lives.
from webcrawler import WebCrawler


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("target")
    # Parse the thread count as an int; the original passed the raw string
    # through via `args.number_of_threads or 5`.
    parser.add_argument("--number_of_threads", type=int, default=5)
    parser.add_argument("--output_file")
    parser.add_argument("--verbose", help="increase output verbosity",
                        action="store_true")
    args = parser.parse_args()
    logger = (loggers.VerboseCrawlerLogger if args.verbose
              else loggers.SilentCrawlerLogger)
    webcrawler = WebCrawler(args.number_of_threads, logger)
    webcrawler.crawl(args.target, args.output_file)


if __name__ == "__main__":
    main()
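# Example invocation, assuming this script is saved as crawl.py (the file
# name is not given in this snippet):
#   python crawl.py http://triplebyte.github.io/web-crawler-test-site/test1 \
#       --number_of_threads 10 --output_file graph.txt --verbose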