Example #1
    def test_challenge(self):
        crawler = WebCrawler(5, VerboseCrawlerLogger)
        crawler.crawl("triplebyte.github.io/web-crawler-test-site/test1", None,
                      True)

        url = "http://triplebyte.github.io/web-crawler-test-site/test1/SVG_logo.svg"
        self.assertEqual(crawler.graph[url].request_type, "head")
Example #2
    def test_challenge(self):
        crawler = WebCrawler(5, VerboseCrawlerLogger)
        crawler.crawl("http://triplebyte.github.io/web-crawler-test-site/test2", None, True)

        target_url = "http://triplebyte.github.io/web-crawler-test-site/test2/page2.html"
        print(crawler.graph[target_url])
        self.assertIsNotNone(crawler.graph[target_url])
Example #3
    def test_challenge(self):
        crawler = WebCrawler(5, VerboseCrawlerLogger)
        crawler.crawl("http://triplebyte.github.io/web-crawler-test-site/test2", None, True)

        target_url = "http://triplebyte.github.io/web-crawler-test-site/test2/page2.html"

        self.assertIsNotNone(crawler.graph[target_url])
Example #4
    def test_crawling(self):
        crawler = WebCrawler(100, SilentCrawlerLogger)
        crawler.crawl(
            "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/",
            None, True)

        self.assert_crawled_with_get(
            "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2",
            crawler)
        self.assert_crawled_with_get(
            "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-real",
            crawler)
        self.assert_crawled_with_get(
            "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake",
            crawler)
        self.assertEqual(
            crawler.graph.nodes[
                "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-real"]
            .status, 'success')
        self.assertEqual(
            crawler.graph.nodes[
                "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake"]
            .status_code, 404)
        self.assertEqual(
            crawler.graph.nodes[
                "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake"]
            .status, 'success')

        self.assertIn(
            "http://cdn.business2community.com/wp-content/uploads/2013/07/terrible-content.jpg",
            crawler.graph.nodes)
        self.assertEqual(
            crawler.graph.nodes[
                "http://cdn.business2community.com/wp-content/uploads/2013/07/terrible-content.jpg"]
            .request_type, 'head')
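The assert_crawled_with_get helper used above is not shown in this listing; a minimal sketch of such a helper, assuming the crawl graph exposes the same nodes mapping and request_type attribute used by the other assertions, might look like this:

    def assert_crawled_with_get(self, url, crawler):
        # Hypothetical helper: the URL must appear in the crawl graph and must
        # have been fetched with a full GET rather than a HEAD request.
        self.assertIn(url, crawler.graph.nodes)
        self.assertEqual(crawler.graph.nodes[url].request_type, "get")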
Example #5
    def test_crawling(self):
        crawler = WebCrawler(100, SilentCrawlerLogger)
        crawler.crawl(
            "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/",
            None, True)

        self.assertIn(
            "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2",
            crawler.graph.nodes)
        self.assertIn(
            "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-real",
            crawler.graph.nodes)
        self.assertIn(
            "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake",
            crawler.graph.nodes)
        self.assertEqual(
            crawler.graph.nodes[
                "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-real"]
            .status, 'success')
        self.assertEqual(
            crawler.graph.nodes[
                "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake"]
            .status_code, 404)
        self.assertEqual(
            crawler.graph.nodes[
                "http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake"]
            .status, 'success')
Example #6
    def test_challenge(self):
        crawler = WebCrawler(5, VerboseCrawlerLogger)
        crawler.crawl("triplebyte.github.io/web-crawler-test-site/test1",
                      None,
                      True)

        url = "http://triplebyte.github.io/web-crawler-test-site/test1/SVG_logo.svg"
        self.assertEqual(crawler.graph[url].request_type, "head")
Example #7
    def test_challenge(self):
        crawler = WebCrawler(5, VerboseCrawlerLogger)
        crawler.crawl(
            "http://triplebyte.github.io/web-crawler-test-site/test4/", None,
            True)

        self.assertTrue(
            "https://triplebyte.github.io/web-crawler-test-site/test4/page3" in
            crawler.graph.nodes)
Example #8
    def test_crawling(self):
        crawler = WebCrawler(100, SilentCrawlerLogger)
        crawler.crawl("http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/", None, True)

        self.assertIn("http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2", crawler.graph.nodes)
        self.assertIn("http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-real", crawler.graph.nodes)
        self.assertIn("http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake", crawler.graph.nodes)
        self.assertEqual(crawler.graph.nodes["http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-real"].status, 'success')
        self.assertEqual(crawler.graph.nodes["http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake"].status_code, 404)
        self.assertEqual(crawler.graph.nodes["http://triplebyte.github.io/web-crawler-test-site/already-passing-tests/page2-fake"].status, 'success')
Example #9
    def test_challenge(self):
        # The bug here is that the crawler will hang. Don't sit around waiting
        # for it to finish!
        crawler = WebCrawler(5, VerboseCrawlerLogger)
        crawler.crawl("http://triplebyte.github.io/web-crawler-test-site/test3/", None, True)

        self.assertIn(
            "http://blah.com:7091",
            crawler.graph.nodes
        )
Example #10
    def test_crawling_triplebyte(self):
        crawler = WebCrawler(100, SilentCrawlerLogger)
        crawler.crawl("https://www.triplebyte.com", None, True)

        self.assertIn("https://www.triplebyte.com", crawler.graph.nodes)

        self.assertIn("https://triplebyte.com/careers", crawler.graph.nodes)

        self.assertEqual(
            crawler.graph.nodes["http://www.olark.com?welcome"].request_type,
            "head")
Example #11
def main():
    url = 'http://revistaautoesporte.globo.com/rss/ultimas/feed.xml'
    crawler = WebCrawler(url)
    data = crawler.build_data()

    crawler.data_to_file(data)
    print(crawler.dump_data(data))
Example #12
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("target")
    parser.add_argument("--number_of_threads")
    parser.add_argument("--output_file")
    parser.add_argument("--verbose",
                        help="increase output verbosity",
                        action="store_true")

    args = parser.parse_args()

    webcrawler = WebCrawler(args.number_of_threads or 5,
                            args.verbose and
                            loggers.VerboseCrawlerLogger or
                            loggers.SilentCrawlerLogger)

    webcrawler.crawl(args.target, args.output_file)
Example #13
def get_crawler(uri: str, chrome_driver_path, dump_to_local):
    if parse.urlparse(uri).scheme in (
            'http',
            'https',
    ):
        scraper = WebCrawler(uri, chrome_driver_path, dump_to_local)
    else:
        scraper = LocalCrawler(uri)
    return scraper
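The function above picks a crawler implementation based on the URI scheme (the chrome_driver_path parameter suggests a Selenium-backed WebCrawler, though that is an assumption). A short hypothetical usage, with made-up URLs and paths, could be:

    # http(s) URIs get the WebCrawler; anything else falls back to LocalCrawler.
    remote = get_crawler("https://example.com", "/usr/local/bin/chromedriver", dump_to_local=False)
    local = get_crawler("saved_pages/index.html", None, dump_to_local=False)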
Example #14
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("target")
    parser.add_argument("--number_of_threads")
    parser.add_argument("--output_file")
    parser.add_argument("--verbose",
                        help="increase output verbosity",
                        action="store_true")

    args = parser.parse_args()

    webcrawler = WebCrawler(
        args.number_of_threads or 5,
        args.verbose and loggers.VerboseCrawlerLogger
        or loggers.SilentCrawlerLogger)

    webcrawler.crawl(args.target, args.output_file)
Example #15
    def get_one_day(self):
        params = {
            'q': self.city,
            'appid': self.api_key,
            'lang': self.lang,
            'units': self.units
        }
        uri = self.uri + "?" + urllib.urlencode(params, True)
        print(uri)
        data = WebCrawler.get_data(uri)
        print(data)
        self.decode_json(data)
        return self.speech_string
Example #16
class TestWebCrawler(unittest.TestCase):
    def setUp(self) -> None:
        self.content_fetcher = unittest.mock.Mock()
        self.content_fetcher.retrieve_page.return_value = self.generate_mock_page()
        self.web_crawler = WebCrawler(self.content_fetcher)

    def generate_mock_page(self):
        return "<!DOCTYPE html>" \
               "<html><body><h1>Some header</h1><p>Some text</p>" \
               "<a href='http://some_link2.com'></a>" \
               "<a href='http://some_link2.com'></a>" \
               "</body></html>"

    def test_crawl_does_not_return_duplicate_links(self):
        urls = self.web_crawler.discover("http://some_link.com", limit=10)
        self.assertEqual(urls,
                         ["http://some_link.com", "http://some_link2.com"])

    def test_crawl_does_not_give_more_links_than_the_limit(self):
        urls = self.web_crawler.discover("http://some_link.com", limit=1)
        self.assertEqual(urls, ["http://some_link.com"])
Example #17
    def __init__(self, outfile, startUrl, limit, searchType, keyword):
        self.outfile = outfile
        self.startPage = startUrl
        self.limit = limit

        if keyword is None:
            self.keywordExists = False
        else:
            self.keywordExists = True
            self.keyword = keyword

        self.searchType = searchType
        self.currentLevel = 0
        self.webCrawler = WebCrawler(keyword)
        self.idCount = -1
        # 0 represents root level
        self.rootNode = PageNode(None, self.getUID(), startUrl, 0)
        self.activeNode = None
        self.rootError = None
        self.crawled = set()

        # seed the random integer generator for DFS method
        random.seed()
Example #18
    def setUp(self) -> None:
        self.content_fetcher = unittest.mock.Mock()
        self.content_fetcher.retrieve_page.return_value = self.generate_mock_page()
        self.web_crawler = WebCrawler(self.content_fetcher)
Example #19
from parse import Parse
from webcrawler import WebCrawler
from interface import Interface

if __name__ == "__main__":
    interface = Interface()
    parse = Parse()
    args = parse.get_parse()
    parse.do_parse(args)
    webcrawler = WebCrawler(parse)
    webcrawler.get_headers(interface.header_inter())
    webcrawler.get_data(interface.data_inter())
    webcrawler.get_url(interface.url_inter())
    webcrawler.do_crawl()
Example #20
############################################
# Parser for mystuwe.de                    #
############################################
from webcrawler import WebCrawler
from stuweparser import StuweParser
from datetime import *

morgenstelle = "http://www.my-stuwe.de/mensa/mensa-morgenstelle-tuebingen"
wilhelm = "http://www.my-stuwe.de/mensa/mensa-wilhelmstrasse-tuebingen/"
alldaysWillhelm = "http://www.my-stuwe.de/mensa/mensa-wilhelmstrasse-tuebingen/?woche="+str(datetime.today().isocalendar()[1] + 1)
alldays = "http://www.my-stuwe.de/mensa/mensa-morgenstelle-tuebingen/?woche="+str(datetime.today().isocalendar()[1] + 1)


print("Crawling: " + alldays)
crawler = WebCrawler(alldays)
print("Crawling: " + alldaysWillhelm)
crawler2 = WebCrawler(alldaysWillhelm)

print("Start xml generation")

parser = StuweParser(crawler.getHTML())
parser2 = StuweParser(crawler2.getHTML())

try:
	#parser.generateXML()
	parser.generateWeekXML("overviewMorgen.xml")
	parser2.generateWeekXML("overviewWillhelm.xml")
	print("XML generated")
except Exception as e:
	print("An error occurred while generating xml file")
	print(e)
Example #21
from webcrawler import WebCrawler
import sys

firefox = {'User-Agent':'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0'}

try:
    user_input = sys.argv[1]
except IndexError:
    user_input = False

crawler = WebCrawler(user_input, firefox)
#crawler.validate_url()
crawler.get_amazon_image()
Example #22
#!/usr/bin/python
import sys
from webcrawler import WebCrawler


if __name__ == "__main__":
    website = 'https://pier31.co'
    if len(sys.argv) <= 1:
        print("\nYou didn't enter an address. Defaulting to %s" % website)
    else:
        website = sys.argv[1]
        print("\nChosen address: %s" % website)

    web_crawler = WebCrawler(website)
    web_crawler.crawl_it()