def test_pop_returns_none_if_no_more_urls_are_available(monkeypatch):
    """Check that ``pop`` returns ``None`` when nothing can be handed out.

    Two URLs sharing the host ``www.example.org`` are added while
    ``time.time`` is mocked.  The test then verifies that:

    * the first ``pop`` returns a URL and an immediate second ``pop``
      returns ``None``;
    * a ``pop`` issued 10 time units later returns the remaining URL;
    * a ``pop`` issued 20 time units later returns ``None`` and leaves
      ``u.hosts`` and ``u.buckets`` empty.
    """
    u = URLFrontier(Counter(), ignore_url=None)
    url_1 = 'http://www.example.org/'
    url_2 = 'http://www.example.org/index.html'
    elements = set()

    monkeypatch.setattr('time.time', lambda: 0)
    waittime = u.add(url_1)

    # NOTE: the lambda reads `waittime` at call time (late binding).  During
    # the `add` call below the mocked clock returns the first wait time;
    # once `waittime` is rebound, the clock follows the new value.  Keep the
    # single shared name so this behaviour is preserved.
    monkeypatch.setattr('time.time', lambda: waittime)
    waittime = u.add(url_2)

    elements.add(u.pop())
    assert len(elements) == 1
    assert u.pop() is None

    # we need another `pop` 10s later to remove the host from the list
    monkeypatch.setattr('time.time', lambda: waittime + 10)
    elements.add(u.pop())
    assert len(elements) == 2
    assert elements == {url_1, url_2}

    monkeypatch.setattr('time.time', lambda: waittime + 20)
    assert u.pop() is None

    # heap should be empty as well
    assert len(u.hosts) == 0
    assert len(u.hosts.keys) == 0
    assert len(u.hosts.values) == 0

    # and the bucket should have been removed
    assert len(u.buckets) == 0
def test_popping_a_url_disable_the_host_for_a_while(monkeypatch):
    """Check that each successful ``pop`` is followed by a ``None`` pop.

    Three URLs on ``en.wikipedia.org`` are added in a single ``add`` call
    together with an origin URL (presumably the page they were found on --
    confirm against ``URLFrontier.add``).  With ``time.time`` mocked, the
    test verifies that:

    * ``pop`` returns ``None`` before the wait time returned by ``add``
      has been reached;
    * at ``waittime``, ``waittime + 10`` and ``waittime + 20`` exactly one
      URL is returned, and an immediate second ``pop`` returns ``None``;
    * the three popped URLs are exactly the three that were added (the
      origin itself is not among them).
    """
    u = URLFrontier(Counter(), ignore_url=None)
    origin_1 = 'https://en.wikipedia.org/wiki/Whatever'
    url_1 = 'https://en.wikipedia.org/wiki/Meh'
    url_2 = 'https://en.wikipedia.org/wiki/Interesting_(sarcasm)'
    url_3 = 'https://en.wikipedia.org/wiki/Foo'
    elements = set()

    monkeypatch.setattr('time.time', lambda: 0)
    waittime = u.add(origin_1, {url_1, url_2, url_3})

    # Still at time 0: nothing may be popped yet.
    assert u.pop() is None

    # First URL becomes available once the wait time has elapsed.
    monkeypatch.setattr('time.time', lambda: waittime)
    elements.add(u.pop())
    assert u.pop() is None

    # Second URL, 10 time units later.
    monkeypatch.setattr('time.time', lambda: waittime + 10)
    elements.add(u.pop())
    assert u.pop() is None

    # Third URL, another 10 time units later.
    monkeypatch.setattr('time.time', lambda: waittime + 20)
    elements.add(u.pop())
    assert u.pop() is None

    assert elements == {url_1, url_2, url_3}