Example #1
    def test_challenge(self):
        crawler = WebCrawler(5, VerboseCrawlerLogger)
        crawler.crawl("http://triplebyte.github.io/web-crawler-test-site/test2", None, True)

        target_url = "http://triplebyte.github.io/web-crawler-test-site/test2/page2.html"
        print(crawler.graph[target_url])
        self.assertIsNotNone(crawler.graph[target_url])
Example #2
    def test_crawling(self):
        crawler = WebCrawler(100, SilentCrawlerLogger)
        crawler.crawl(
            "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/",
            None, True)

        self.assertIn(
            "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2",
            crawler.graph.nodes)
        self.assertIn(
            "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-real",
            crawler.graph.nodes)
        self.assertIn(
            "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake",
            crawler.graph.nodes)
        self.assertEqual(
            crawler.graph.nodes[
                "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-real"]
            .status, 'success')
        self.assertEqual(
            crawler.graph.nodes[
                "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake"]
            .status_code, 404)
        self.assertEqual(
            crawler.graph.nodes[
                "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake"]
            .status, 'success')
Example #3
    def test_challenge(self):
        crawler = WebCrawler(5, VerboseCrawlerLogger)
        crawler.crawl("triplebyte.github.io/web-crawler-test-site/test1", None,
                      True)

        url = "http://triplebyte.github.io/web-crawler-test-site/test1/SVG_logo.svg"
        self.assertEqual(crawler.graph[url].request_type, "head")
Example #4
def main():
    url = 'http://revistaautoesporte.globo.com/rss/ultimas/feed.xml'
    crawler = WebCrawler(url)
    data = crawler.build_data()

    crawler.data_to_file(data)
    print(crawler.dump_data(data))
Example #5
    def test_crawling(self):
        crawler = WebCrawler(100, SilentCrawlerLogger)
        crawler.crawl(
            "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/",
            None, True)

        self.assert_crawled_with_get(
            "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2",
            crawler)
        self.assert_crawled_with_get(
            "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-real",
            crawler)
        self.assert_crawled_with_get(
            "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake",
            crawler)
        self.assertEqual(
            crawler.graph.nodes[
                "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-real"]
            .status, 'success')
        self.assertEqual(
            crawler.graph.nodes[
                "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake"]
            .status_code, 404)
        self.assertEqual(
            crawler.graph.nodes[
                "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake"]
            .status, 'success')

        self.assertIn(
            "http://cdn.business2community.com/wp-content/uploads/2013/07/terrible-content.jpg",
            crawler.graph.nodes)
        self.assertEqual(
            crawler.graph.nodes[
                "http://cdn.business2community.com/wp-content/uploads/2013/07/terrible-content.jpg"]
            .request_type, 'head')
Example #6
    def test_challenge(self):
        crawler = WebCrawler(5, VerboseCrawlerLogger)
        crawler.crawl(
            "http://triplebyte.github.io/web-crawler-test-site/test4/", None,
            True)

        self.assertIn(
            "https://triplebyte.github.io/web-crawler-test-site/test4/page3",
            crawler.graph.nodes)
Example #7
def get_crawler(uri: str, chrome_driver_path, dump_to_local):
    if parse.urlparse(uri).scheme in (
            'http',
            'https',
    ):
        scraper = WebCrawler(uri, chrome_driver_path, dump_to_local)
    else:
        scraper = LocalCrawler(uri)
    return scraper
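
A brief usage sketch for the factory above: anything with an http/https scheme is routed to WebCrawler, everything else (e.g., a filesystem path) to LocalCrawler. The argument values here are placeholders, not documented defaults:

# Placeholder arguments; chrome_driver_path=None and dump_to_local=False
# are assumptions about acceptable defaults, not values from the original.
remote = get_crawler('https://example.com', None, False)    # -> WebCrawler
local = get_crawler('/tmp/pages/index.html', None, False)   # -> LocalCrawler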
Example #8
    def test_challenge(self):
        # The bug here is that the crawler will hang. Don't sit around waiting
        # for it to finish!
        crawler = WebCrawler(5, VerboseCrawlerLogger)
        crawler.crawl("http://triplebyte.github.io/web-crawler-test-site/test3/", None, True)

        self.assertIn(
            "http://blah.com:7091",
            crawler.graph.nodes
        )
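
The comment above is the point of this challenge: the crawl can hang. One defensive pattern for running it anyway is to push the crawl onto a daemon thread and join with a deadline. This is a sketch under the same WebCrawler/VerboseCrawlerLogger assumptions as the test, not part of the original suite:

import threading

def crawl_with_timeout(url, timeout_seconds=30):
    # Sketch only: a daemon thread cannot block interpreter exit, so a
    # stuck crawl is abandoned after the deadline instead of hanging us.
    crawler = WebCrawler(5, VerboseCrawlerLogger)
    worker = threading.Thread(
        target=crawler.crawl, args=(url, None, True), daemon=True)
    worker.start()
    worker.join(timeout=timeout_seconds)
    # Whether or not the crawl finished, crawler.graph holds every node
    # reached so far and can still be inspected.
    return crawler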
Example #9
    def test_crawling_triplebyte(self):
        crawler = WebCrawler(100, SilentCrawlerLogger)
        crawler.crawl("https://www.triplebyte.com", None, True)

        self.assertIn("https://www.triplebyte.com", crawler.graph.nodes)

        self.assertIn("https://triplebyte.com/careers", crawler.graph.nodes)

        self.assertEqual(
            crawler.graph.nodes["http://www.olark.com?welcome"].request_type,
            "head")
Example #10
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("target")
    parser.add_argument("--number_of_threads")
    parser.add_argument("--output_file")
    parser.add_argument("--verbose",
                        help="increase output verbosity",
                        action="store_true")

    args = parser.parse_args()

    webcrawler = WebCrawler(
        args.number_of_threads,
        loggers.VerboseCrawlerLogger if args.verbose
        else loggers.SilentCrawlerLogger)

    webcrawler.crawl(args.target, args.output_file)
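
As written, the snippet defines main() but never calls it; the usual entry-point guard completes it. The invocation in the comment assumes the file is saved as crawl.py, which the original does not state:

# Hypothetical invocation:  python crawl.py http://example.com --verbose
if __name__ == "__main__":
    main()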
Example #11
    def __init__(self, outfile, startUrl, limit, searchType, keyword):
        self.outfile = outfile
        self.startPage = startUrl
        self.limit = limit

        if keyword is None:
            self.keywordExists = False
        else:
            self.keywordExists = True
            self.keyword = keyword

        self.searchType = searchType
        self.currentLevel = 0
        self.webCrawler = WebCrawler(keyword)
        self.idCount = -1
        # 0 represents root level
        self.rootNode = PageNode(None, self.getUID(), startUrl, 0)
        self.activeNode = None
        self.rootError = None
        self.crawled = set()

        # seed the random integer generator for DFS method
        random.seed()
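
getUID is referenced but not shown. Given that idCount starts at -1 and the root node is expected to take the first id, a plausible reconstruction (an assumption, not the original helper) is:

    def getUID(self):
        # Assumed helper: hand out sequential ids; the pre-increment means
        # the root node created in __init__ receives id 0.
        self.idCount += 1
        return self.idCount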
Example #12
from parse import Parse
from webcrawler import WebCrawler
from interface import Interface

if __name__ == "__main__":
    interface = Interface()
    parse = Parse()
    args = parse.get_parse()
    parse.do_parse(args)
    webcrawler = WebCrawler(parse)
    webcrawler.get_headers(interface.header_inter())
    webcrawler.get_data(interface.data_inter())
    webcrawler.get_url(interface.url_inter())
    webcrawler.do_crawl()
Example #13
 def setUp(self) -> None:
     self.content_fetcher = unittest.mock.Mock()
     self.content_fetcher.retrieve_page.return_value = self.generate_mock_page(
     )
     self.web_crawler = WebCrawler(self.content_fetcher)
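
A hedged companion test for this setUp: the single-argument crawl() call and the delegation to retrieve_page are assumptions about this WebCrawler variant, not part of the original fixture:

    def test_crawl_fetches_start_page(self):
        # Assumed API: crawl(url) asks the injected content fetcher for the
        # page, which setUp replaced with a Mock returning a canned page.
        self.web_crawler.crawl("http://example.com")
        self.content_fetcher.retrieve_page.assert_called_once_with(
            "http://example.com")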
Example #14
#!/usr/bin/env python3
import sys
from webcrawler import WebCrawler


if __name__ == "__main__":
    website = 'https://pier31.co'
    if len(sys.argv) <= 1:
        print "\nYou didn't enter an address. Defaulting to %s" % website
    else:
        website = sys.argv[1]
        print "\nChoosen address: %s" % website

    web_crawler = WebCrawler(website)
    web_crawler.crawl_it()