Exemple #1
0
def test_pop_returns_none_if_no_more_urls_are_available(monkeypatch):
    """Check that `pop` returns None once no URL is left to hand out.

    Two URLs on the same host are added while `time.time` is patched, so the
    test fully controls the clock seen by the frontier. After both URLs have
    been popped, the frontier's `hosts` and `buckets` containers are expected
    to be empty.
    """
    u = URLFrontier(Counter(), ignore_url=None)
    url_1 = 'http://www.example.org/'
    url_2 = 'http://www.example.org/index.html'
    elements = set()

    monkeypatch.setattr('time.time', lambda: 0)
    waittime = u.add(url_1)

    # NOTE: the lambda looks `waittime` up on every call, so once the name is
    # rebound on the next line the patched clock reports the new value.
    monkeypatch.setattr('time.time', lambda: waittime)
    waittime = u.add(url_2)

    elements.add(u.pop())
    assert len(elements) == 1
    # the second URL is not handed out at the same clock value
    assert u.pop() is None

    # we need another `pop` 10s later to remove the host from the list
    monkeypatch.setattr('time.time', lambda: waittime + 10)
    elements.add(u.pop())
    assert len(elements) == 2
    assert elements == {url_1, url_2}
    monkeypatch.setattr('time.time', lambda: waittime + 20)
    assert u.pop() is None

    # heap should be empty as well
    assert len(u.hosts) == 0
    assert len(u.hosts.keys) == 0
    assert len(u.hosts.values) == 0

    # and the bucket should have been removed
    assert len(u.buckets) == 0
Exemple #2
0
def test_pop_returns_none_if_no_more_urls_are_available(monkeypatch):
    """Check that `pop` returns None once no URL is left to hand out.

    Two URLs on the same host are added while `time.time` is patched, so the
    test fully controls the clock seen by the frontier. After both URLs have
    been popped, the frontier's `hosts` and `buckets` containers are expected
    to be empty.
    """
    u = URLFrontier(Counter(), ignore_url=None)
    url_1 = 'http://www.example.org/'
    url_2 = 'http://www.example.org/index.html'
    elements = set()

    monkeypatch.setattr('time.time', lambda: 0)
    waittime = u.add(url_1)

    # NOTE: the lambda looks `waittime` up on every call, so once the name is
    # rebound on the next line the patched clock reports the new value.
    monkeypatch.setattr('time.time', lambda: waittime)
    waittime = u.add(url_2)

    elements.add(u.pop())
    assert len(elements) == 1
    # the second URL is not handed out at the same clock value
    assert u.pop() is None

    # we need another `pop` 10s later to remove the host from the list
    monkeypatch.setattr('time.time', lambda: waittime + 10)
    elements.add(u.pop())
    assert len(elements) == 2
    assert elements == {url_1, url_2}
    monkeypatch.setattr('time.time', lambda: waittime + 20)
    assert u.pop() is None

    # heap should be empty as well
    assert len(u.hosts) == 0
    assert len(u.hosts.keys) == 0
    assert len(u.hosts.values) == 0

    # and the bucket should have been removed
    assert len(u.buckets) == 0
Exemple #3
0
def test_popping_a_url_disable_the_host_for_a_while(monkeypatch):
    """Check that a host yields at most one URL per clock step.

    Three URLs on the same host are passed to `add` together with an origin
    URL (presumably the page they were found on -- confirm against
    `URLFrontier.add`). With `time.time` patched, every successful `pop` must
    be followed by a `pop` returning None until the clock is moved forward by
    another 10 units.
    """
    u = URLFrontier(Counter(), ignore_url=None)
    origin_1 = 'https://en.wikipedia.org/wiki/Whatever'
    url_1 = 'https://en.wikipedia.org/wiki/Meh'
    url_2 = 'https://en.wikipedia.org/wiki/Interesting_(sarcasm)'
    url_3 = 'https://en.wikipedia.org/wiki/Foo'
    elements = set()

    monkeypatch.setattr('time.time', lambda: 0)
    waittime = u.add(origin_1, {url_1, url_2, url_3})
    # nothing is handed out while the clock is still at 0
    assert u.pop() is None

    monkeypatch.setattr('time.time', lambda: waittime)
    elements.add(u.pop())
    assert u.pop() is None

    monkeypatch.setattr('time.time', lambda: waittime + 10)
    elements.add(u.pop())
    assert u.pop() is None

    monkeypatch.setattr('time.time', lambda: waittime + 20)
    elements.add(u.pop())
    assert u.pop() is None

    # only the three linked URLs were popped, each exactly once
    assert elements == {url_1, url_2, url_3}
Exemple #4
0
def test_popping_a_url_disable_the_host_for_a_while(monkeypatch):
    """Check that a host yields at most one URL per clock step.

    Three URLs on the same host are passed to `add` together with an origin
    URL (presumably the page they were found on -- confirm against
    `URLFrontier.add`). With `time.time` patched, every successful `pop` must
    be followed by a `pop` returning None until the clock is moved forward by
    another 10 units.
    """
    u = URLFrontier(Counter(), ignore_url=None)
    origin_1 = 'https://en.wikipedia.org/wiki/Whatever'
    url_1 = 'https://en.wikipedia.org/wiki/Meh'
    url_2 = 'https://en.wikipedia.org/wiki/Interesting_(sarcasm)'
    url_3 = 'https://en.wikipedia.org/wiki/Foo'
    elements = set()

    monkeypatch.setattr('time.time', lambda: 0)
    waittime = u.add(origin_1, {url_1, url_2, url_3})
    # nothing is handed out while the clock is still at 0
    assert u.pop() is None

    monkeypatch.setattr('time.time', lambda: waittime)
    elements.add(u.pop())
    assert u.pop() is None

    monkeypatch.setattr('time.time', lambda: waittime + 10)
    elements.add(u.pop())
    assert u.pop() is None

    monkeypatch.setattr('time.time', lambda: waittime + 20)
    elements.add(u.pop())
    assert u.pop() is None

    # only the three linked URLs were popped, each exactly once
    assert elements == {url_1, url_2, url_3}