Example #1
0
def test_discover_urls():
    """Check which hyperlinks ``Fetch.discover_urls`` extracts from a page.

    The fixture mixes root-relative, dot-relative, parent-relative and
    absolute links with a ``mailto:`` link and a malformed anchor.  The
    expected result lists the first seven resolved against the page URL,
    in document order, and omits the last two.
    """
    import queue
    import threading

    page_url = 'http://nonexistant/x/'
    expected = [
        'http://nonexistant/a',
        'http://nonexistant/b',
        'http://nonexistant/x/c',
        'http://nonexistant/d',
        'http://othersite/',
        'http://othersite/e',
        'http://othersite/e#foo',
    ]
    html = """
    <html><body>
    <a href='/a' />
    <a href='/b' />
    <a href='./c' />
    <a href='../d' />
    <a href='http://othersite/' />
    <a href='http://othersite/e' />
    <a href='http://othersite/e#foo' />
    <a href='mailto:[email protected]' />
    <a not always as it seems! />
    </body></html>
    """

    fetcher = Fetch(
        url_queue=queue.Queue(),
        url_store=dict(),
        url_lock=threading.Lock(),
    )
    discovered = fetcher.discover_urls(page_url, html)
    eq_(discovered, expected)
Example #2
0
def test_stop_fetch():
    """Test that Fetch can be started and stopped correctly.

    Walks a ``Fetch`` worker through its lifecycle: not alive before
    ``start()``, alive afterwards with ``_force_ending`` unset, then
    ``stop()`` sets ``_force_ending`` and ``join()`` leaves it not alive.
    """
    import queue
    import threading

    iq = queue.Queue()
    # Seed the queue with one URL so the worker has something to pick up.
    iq.put('http://nonexistant')
    f = Fetch(url_queue=iq, url_store=dict(), url_lock=threading.Lock())
    # NOTE(review): assumes Fetch subclasses threading.Thread -- confirm.
    # Thread.isAlive() was removed in Python 3.9; is_alive() is the
    # supported spelling and exists on every Python 3 release.
    eq_(f.is_alive(), False)
    f.start()
    eq_(f.is_alive(), True)
    eq_(f._force_ending, False)
    f.stop()
    eq_(f._force_ending, True)
    # Wait for the worker to finish before checking that it has exited.
    f.join()
    eq_(f.is_alive(), False)
Example #3
0
def test_store_urldata():
    """Check the link graph that ``Fetch.store_urldata`` builds in a store.

    Page ``x`` is recorded with seven outgoing links, then page ``d`` is
    recorded linking back to ``x`` (twice, to confirm that repeating an
    entry changes nothing).  After each call the sizes of the relevant
    ``'outgoing'`` / ``'incoming'`` collections and of the store itself
    are verified.
    """
    import queue
    import threading

    page_x = 'http://nonexistant/x/'
    page_d = 'http://nonexistant/d'
    links_from_x = [
        'http://nonexistant/a',
        'http://nonexistant/b',
        'http://nonexistant/x/c',
        page_d,
        'http://othersite/',
        'http://othersite/e',
        'http://othersite/e#foo',
    ]

    fetcher = Fetch(
        url_queue=queue.Queue(),
        url_store=dict(),
        url_lock=threading.Lock(),
    )
    graph = {}

    def check_graph(x_incoming):
        # Only x's incoming count is expected to vary between steps.
        eq_(len(graph[page_x]['outgoing']), len(links_from_x))
        eq_(len(graph[page_x]['incoming']), x_incoming)
        eq_(len(graph[page_d]['incoming']), 1)
        # One entry per outgoing URL plus the entry for x itself.
        eq_(len(graph), len(links_from_x) + 1)

    # Step 1: record x and everything it links to.
    fetcher.store_urldata(
        current_url=page_x, outgoing_urls=links_from_x, store=graph)
    check_graph(0)

    # Step 2: d links back to x, so x gains its first incoming link.
    fetcher.store_urldata(
        current_url=page_d, outgoing_urls=[page_x], store=graph)
    check_graph(1)

    # Step 3: the identical relationship again must leave all counts as is.
    fetcher.store_urldata(
        current_url=page_d, outgoing_urls=[page_x], store=graph)
    check_graph(1)