def test_pop_returns_none_if_no_more_urls_are_available(monkeypatch):
    u = URLFrontier(Counter(), ignore_url=None)
    url_1 = 'http://www.example.org/'
    url_2 = 'http://www.example.org/index.html'
    elements = set()
    monkeypatch.setattr('time.time', lambda: 0)
    waittime = u.add(url_1)
    monkeypatch.setattr('time.time', lambda: waittime)
    waittime = u.add(url_2)
    elements.add(u.pop())
    assert len(elements) == 1
    assert u.pop() is None
    # we need another `pop` 10s later to remove the host from the list
    monkeypatch.setattr('time.time', lambda: waittime + 10)
    elements.add(u.pop())
    assert len(elements) == 2
    assert elements == set([url_1, url_2])
    monkeypatch.setattr('time.time', lambda: waittime + 20)
    assert u.pop() is None
    # heap should be empty as well
    assert len(u.hosts) == 0
    assert len(u.hosts.keys()) == 0
    assert len(u.hosts.values()) == 0
    # and the bucket should have been removed
    assert len(u.buckets) == 0
def test_adding_an_existing_url_to_frontier_doesnt_work():
    u = URLFrontier(Counter(), ignore_url=None)
    origin_1 = 'https://en.wikipedia.org/wiki/Whatever'
    url_1 = 'https://www.example.org/index.html'
    u.add(origin_1, url_1)
    u.add(origin_1, url_1)
    assert len(u.urls) == 1
def test_popping_a_url_disables_the_host_for_a_while(monkeypatch):
    u = URLFrontier(Counter(), ignore_url=None)
    origin_1 = 'https://en.wikipedia.org/wiki/Whatever'
    url_1 = 'https://en.wikipedia.org/wiki/Meh'
    url_2 = 'https://en.wikipedia.org/wiki/Interesting_(sarcasm)'
    url_3 = 'https://en.wikipedia.org/wiki/Foo'
    elements = set()
    monkeypatch.setattr('time.time', lambda: 0)
    waittime = u.add(origin_1, set([url_1, url_2, url_3]))
    assert u.pop() is None
    monkeypatch.setattr('time.time', lambda: waittime)
    elements.add(u.pop())
    assert u.pop() is None
    monkeypatch.setattr('time.time', lambda: waittime + 10)
    elements.add(u.pop())
    assert u.pop() is None
    monkeypatch.setattr('time.time', lambda: waittime + 20)
    elements.add(u.pop())
    assert u.pop() is None
    assert elements == set([url_1, url_2, url_3])
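
# The three tests above assume a frontier with: a deduplication set
# (`urls`), one FIFO bucket of pending URLs per host (`buckets`), and a
# per-host "next allowed pop" schedule (`hosts`) enforcing a 10-second
# politeness delay. A minimal sketch consistent with that observed
# behavior follows. It is NOT the implementation under test: the 10s
# constant, the urlsplit-based host key, and the plain dict standing in
# for the heap are all assumptions, and the real `add` also accepts an
# origin and sets of URLs, which this sketch omits.
import time
from collections import deque
from urllib.parse import urlsplit


class SketchURLFrontier:
    DELAY = 10  # assumed politeness delay between pops of the same host

    def __init__(self, stats, ignore_url=None):
        self.stats = stats
        self.ignore_url = ignore_url
        self.urls = set()   # every URL ever added, for deduplication
        self.buckets = {}   # host -> deque of pending URLs
        self.hosts = {}     # host -> earliest time the host may be popped

    def add(self, url):
        host = urlsplit(url).netloc
        if url in self.urls or (self.ignore_url and self.ignore_url(url)):
            return self.hosts.get(host, time.time())
        self.urls.add(url)
        self.buckets.setdefault(host, deque()).append(url)
        # a new host becomes available one delay from now; `add` returns
        # that time, which the tests use to fast-forward the clock
        return self.hosts.setdefault(host, time.time() + self.DELAY)

    def pop(self):
        now = time.time()
        for host, ready_at in self.hosts.items():
            if ready_at > now:
                continue
            bucket = self.buckets[host]
            if not bucket:
                # empty bucket: drop the host and its bucket entirely,
                # which is why the tests need one extra `pop` at the end
                del self.buckets[host]
                del self.hosts[host]
                return None
            self.hosts[host] = now + self.DELAY  # disable host for a while
            return bucket.popleft()
        return None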
def run_crawler(arguments):
    stats = Counter()
    w = Watch(stats)
    w.start()
    q_urls = gevent.queue.Queue(settings.REQUEST_CONCURRENCY * 10)
    q_results = gevent.queue.Queue(settings.REQUEST_CONCURRENCY * 10)
    # frontier = URLFrontier(stats, ignore_url=ignore_url)
    frontier = URLFrontier(stats)
    crawlers = Group(settings.REQUEST_CONCURRENCY, Crawler,
                     args=(q_urls, q_results, stats))
    url_feeder = URLFeeder(frontier, q_urls)
    result_processor = ResultProcessor(frontier, q_results)
    # frontier.add('http://kpvz7kpmcmne52qf.onion/wiki/index.php/Main_Page')
    # frontier.add('http://zqktlwi4fecvo6ri.onion/wiki/index.php/Main_Page')
    frontier.add('https://en.wikipedia.org/')
    url_feeder.start()
    result_processor.start()
    return crawlers
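
# run_crawler wires a three-stage gevent pipeline: URLFeeder moves URLs
# from the frontier onto the bounded q_urls queue, the Crawler greenlets
# fetch them and push results onto q_results, and ResultProcessor folds
# extracted links back into the frontier. URLFeeder's implementation is
# not shown in this file; a hedged sketch of what it might look like,
# assuming it subclasses gevent.Greenlet (the 1s idle sleep is a guess):
import gevent


class SketchURLFeeder(gevent.Greenlet):
    def __init__(self, frontier, q_urls):
        super().__init__()
        self.frontier = frontier
        self.q_urls = q_urls

    def _run(self):
        while True:
            url = self.frontier.pop()
            if url is None:
                # no host is ready yet; yield to the other greenlets
                gevent.sleep(1)
                continue
            # blocks when the bounded queue is full, which throttles the
            # feeder to the crawlers' pace
            self.q_urls.put(url)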
def test_adding_url_to_frontier_adds_it_to_the_priority_queue(monkeypatch):
    u = URLFrontier(Counter(), ignore_url=None)
    origin_1 = 'https://en.wikipedia.org/wiki/Whatever'
    url_1 = 'https://www.example.org/index.html'
    # time is not important because new domains will be added to the
    # heap with priority zero
    # monkeypatch.setattr('time.time', lambda: 100)
    u.add(origin_1, url_1)
    origin_2 = 'https://www.torproject.org/i-luv-nsa.html'
    url_2 = 'https://internetdefenseleague.org/'
    # time is not important because new domains will be added to the
    # heap with priority zero
    # monkeypatch.setattr('time.time', lambda: 200)
    u.add(origin_2, url_2)
    # url_3 = 'https://internetdefenseleague.org/whatever.html'
    u.add(origin_2, url_2)
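
# Every test above passes ignore_url=None, and run_crawler carries a
# commented-out `URLFrontier(stats, ignore_url=ignore_url)` line.
# Assuming the callback takes a URL and returns True when the frontier
# should skip it (a guess from the keyword name, not confirmed by the
# code shown here), a filter for the crawler might look like:
from urllib.parse import urlsplit

SKIP_EXTENSIONS = ('.jpg', '.png', '.gif', '.pdf', '.zip')


def ignore_url(url):
    parts = urlsplit(url)
    if parts.scheme not in ('http', 'https'):
        return True  # skip mailto:, ftp:, javascript:, ...
    # str.endswith accepts a tuple, so one call covers all extensions
    return parts.path.lower().endswith(SKIP_EXTENSIONS)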