def test_discover_urls():
    """Check which anchors ``Fetch.discover_urls`` returns for a sample page.

    The sample page is discovered relative to ``http://nonexistant/x/`` and
    holds root-relative, dot-relative, parent-relative and absolute links,
    plus a ``mailto:`` link and an anchor with no ``href``. The expected
    result lists the first seven as absolute URLs, in document order, and
    omits the last two.
    """
    import queue
    import threading

    page_url = 'http://nonexistant/x/'
    expected = [
        'http://nonexistant/a',
        'http://nonexistant/b',
        'http://nonexistant/x/c',
        'http://nonexistant/d',
        'http://othersite/',
        'http://othersite/e',
        'http://othersite/e#foo',
    ]

    # Adjacent literals are concatenated into a single string; the pieces are
    # split per anchor purely for readability.
    # NOTE(review): the mailto address looks like it was mangled by an e-mail
    # obfuscation filter at some point -- confirm against the upstream file.
    markup = (
        " <html><body>"
        " <a href='/a' />"
        " <a href='/b' />"
        " <a href='./c' />"
        " <a href='../d' />"
        " <a href='http://othersite/' />"
        " <a href='http://othersite/e' />"
        " <a href='http://othersite/e#foo' />"
        " <a href='mailto:[email protected]' />"
        " <a not always as it seems! />"
        " </body></html> "
    )

    fetcher = Fetch(
        url_queue=queue.Queue(),
        url_store={},
        url_lock=threading.Lock(),
    )
    eq_(fetcher.discover_urls(page_url, markup), expected)
def test_stop_fetch():
    """Test that Fetch can be started and stopped correctly.

    A single URL is queued, the worker is started, asked to stop via
    ``stop()`` and then joined. Liveness is checked before start, after
    start and after join; the ``_force_ending`` flag is checked on both
    sides of the ``stop()`` call.
    """
    import queue
    import threading

    iq = queue.Queue()
    iq.put('http://nonexistant')
    f = Fetch(url_queue=iq, url_store=dict(), url_lock=threading.Lock())

    # Thread.isAlive() was removed in Python 3.9; is_alive() is the supported
    # spelling and exists on every Python 3 release.
    # NOTE(review): assumes Fetch subclasses threading.Thread -- confirm.
    eq_(f.is_alive(), False)

    f.start()
    eq_(f.is_alive(), True)
    eq_(f._force_ending, False)

    f.stop()
    eq_(f._force_ending, True)

    # Wait for the worker to finish before checking that it is really gone.
    f.join()
    eq_(f.is_alive(), False)
def test_store_urldata():
    """Check the incoming/outgoing bookkeeping done by ``Fetch.store_urldata``.

    Page ``x`` is stored with seven outgoing links, then page ``d`` is stored
    twice with a single link back to ``x``. After every call the test checks
    the sizes of ``x``'s ``outgoing`` and ``incoming`` entries, the size of
    ``d``'s ``incoming`` entry, and the total number of keys in the store.
    """
    import queue
    import threading

    page_x = 'http://nonexistant/x/'
    page_d = 'http://nonexistant/d'
    links_from_x = [
        'http://nonexistant/a',
        'http://nonexistant/b',
        'http://nonexistant/x/c',
        page_d,
        'http://othersite/',
        'http://othersite/e',
        'http://othersite/e#foo',
    ]

    fetcher = Fetch(
        url_queue=queue.Queue(),
        url_store={},
        url_lock=threading.Lock(),
    )
    graph = {}

    def check_graph(incoming_to_x):
        # The same four checks apply after every store_urldata call; only
        # the expected number of links pointing at x differs.
        eq_(len(graph[page_x]['outgoing']), len(links_from_x))
        eq_(len(graph[page_x]['incoming']), incoming_to_x)
        eq_(len(graph[page_d]['incoming']), 1)
        eq_(len(graph), len(links_from_x) + 1)

    fetcher.store_urldata(current_url=page_x,
                          outgoing_urls=links_from_x,
                          store=graph)
    check_graph(0)

    # First pass: 'd' points back to 'x', which should raise x's "incoming"
    # count to 1. Second pass: the exact same entry is added again and
    # nothing should change.
    for _ in range(2):
        fetcher.store_urldata(current_url=page_d,
                              outgoing_urls=[page_x],
                              store=graph)
        check_graph(1)