def test_main_process_keyboard():
    """A KeyboardInterrupt raised mid-crawl aborts cleanly with no URLs seen."""
    crawler = Patu(urls=['www.djangoproject.com'], depth=1)

    def interrupt():
        raise KeyboardInterrupt

    # Force the very first unit of work to simulate the user hitting Ctrl-C.
    crawler.process_next_url = interrupt
    crawler.crawl()
    eq_(crawler.seen_urls, set())
def test_generate():
    """Crawl in --generate mode and check the 'url referer' pairs it prints.

    stdout is temporarily redirected into a scratch file so the generated
    output can be read back and compared against the known link graph of
    the test site.
    """
    with open('.test_generated.txt', 'w') as f:
        saved_stdout = sys.stdout
        sys.stdout = f
        try:
            p = Patu(urls=[TEST_URL], depth=1, generate=True)
            p.crawl()
        finally:
            # Restore stdout even if crawl() raises; otherwise every later
            # test (and the runner itself) keeps writing to a closed file.
            sys.stdout = saved_stdout
    with open('.test_generated.txt', 'r') as f:
        generated_urls = f.read().strip()
    remove('.test_generated.txt')
    # One "crawled-url referer" pair per line; the seed has no referer (None).
    correct_urls = """
http://www.djangoproject.com None
http://www.djangoproject.com/weblog/ http://www.djangoproject.com
http://www.djangoproject.com/weblog/2010/apr/22/django-1_2-release-schedule-update-6/ http://www.djangoproject.com
http://www.djangoproject.com/ http://www.djangoproject.com
http://www.djangoproject.com/weblog/2010/apr/28/django-1_2-release-schedule-update-7/ http://www.djangoproject.com
http://www.djangoproject.com/weblog/2010/may/05/12-rc-1/ http://www.djangoproject.com
http://www.djangoproject.com/weblog/2010/apr/14/django-1_2-release-schedule-update-5/ http://www.djangoproject.com
http://www.djangoproject.com/foundation/ http://www.djangoproject.com
http://www.djangoproject.com/community/ http://www.djangoproject.com
http://www.djangoproject.com/download/ http://www.djangoproject.com
"""
    correct_urls = correct_urls.strip()
    eq_(generated_urls, correct_urls)
def main():
    """Command-line entry point: parse options, seed the crawler, crawl.

    Positional arguments are the seed URLs. Fix: the -v help text referred
    to a non-existent '--nospiner' flag; it now names '--nospinner'.
    """
    parser = OptionParser()
    parser.add_option("-s", "--spiders", dest="spiders", type="int", default=1,
                      help="sends more than one spider")
    parser.add_option("-S", "--nospinner", dest="spinner", action="store_false",
                      default=True, help="turns off the spinner")
    parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                      default=False,
                      help="outputs every request (implies --nospinner)")
    parser.add_option("-d", "--depth", dest="depth", type="int", default=-1,
                      help="does a breadth-first crawl, stopping after DEPTH levels")
    parser.add_option('-g', '--generate', dest='generate', action='store_true',
                      default=False,
                      help='generate a list of crawled URLs on stdout')
    parser.add_option('-i', '--input', dest='input_file', type='str', default='',
                      help='file of URLs to crawl')
    (options, args) = parser.parse_args()

    # Seed URLs come from the positional args; coerced to unicode (Python 2).
    urls = [unicode(url) for url in args]
    spider = Patu(
        urls=urls,
        spiders=options.spiders,
        spinner=options.spinner,
        verbose=options.verbose,
        depth=options.depth,
        generate=options.generate,
        input_file=options.input_file,
    )
    spider.crawl()
    # Python 2 print statement: emit a trailing newline after the crawl.
    print
def test_worker():
    """A single worker drains the task queue and returns the fetched page."""
    crawler = Patu(urls=['www.djangoproject.com'], depth=1)
    # Queue every pending URL, then the sentinel that tells the worker to exit.
    for queued_url in crawler.next_urls.iterkeys():
        crawler.task_queue.put(queued_url)
    crawler.task_queue.put('STOP')
    crawler.worker()
    fetched_content = crawler.done_queue.get().content
    with open(TEST_HTML) as fixture:
        eq_(fixture.read(), fetched_content)
def test_stdin():
    """input_file='-' makes the crawler read its seed URLs from stdin."""
    with open(TEST_INPUT) as f:
        saved_stdin = sys.stdin
        sys.stdin = f
        try:
            p = Patu(depth=1, input_file='-', verbose=True)
            p.crawl()
        finally:
            # Restore stdin even if the crawl raises, so a failure here
            # cannot leave later tests reading from a closed file.
            sys.stdin = saved_stdin
    eq_(p.seen_urls, SEEN_URLS)
def test_error():
    """A failed fetch is reported as '[500] url (from referer)' on stdout."""
    with open('.test_generated.txt', 'w') as f:
        saved_stdout = sys.stdout
        sys.stdout = f
        try:
            p = Patu(urls=['error.me'], depth=1)
            p.crawl()
        finally:
            # Restore stdout even if crawl() raises; without this a failure
            # leaves stdout pointing at a closed file for later tests.
            sys.stdout = saved_stdout
    with open('.test_generated.txt', 'r') as f:
        eq_(f.read().strip(), '[500] http://error.me (from None)')
def test_worker_input_file():
    """With an input file, the worker only reports URLs from that file."""
    crawler = Patu(urls=['www.djangoproject.com'], depth=1, input_file=TEST_INPUT)
    for queued_url in crawler.next_urls.iterkeys():
        crawler.task_queue.put(queued_url)
    crawler.task_queue.put('STOP')
    crawler.worker()
    # Sentinel so the drain loop below terminates.
    crawler.done_queue.put('STOP')
    for result in iter(crawler.done_queue.get, 'STOP'):
        # Results without a .url attribute count as "no URL" (falsy).
        fetched_url = getattr(result, 'url', False)
        assert fetched_url in SEEN_URLS or not fetched_url
def test_initial_redirect():
    """Seed URLs that redirect still crawl the canonical site."""
    # Both seeds redirect to the canonical www.djangoproject.com host.
    for seed in ('redirect.me', 'djangoproject.com'):
        crawler = Patu(urls=[seed], depth=2)
        crawler.crawl()
        eq_(crawler.seen_urls, SEEN_URLS)
def test_worker_statuses():
    """Drive the worker through redirect, server-error and exception paths.

    Mostly a coverage exercise for the worker process: each address maps to
    the status code the worker should attach to the finished result.
    """
    cases = (
        ('www.djangoproject.com/offsite_redirect', 200),
        ('error.me', 500),
        ('io.me', -1),
        ('keyboard.me', -1),
    )
    for address, expected_status in cases:
        crawler = Patu(urls=[address], depth=1)
        for queued_url in crawler.next_urls.iterkeys():
            crawler.task_queue.put(queued_url)
        crawler.task_queue.put('STOP')
        crawler.worker()
        eq_(crawler.done_queue.get().status_code, expected_status)
def test_redirect():
    """An offsite redirect keeps its original URL and yields no links."""
    crawler = Patu(urls=['www.djangoproject.com'])
    target = 'http://www.djangoproject.com/offsite_redirect'
    response = crawler.get_urls(MockHttp(), target)
    eq_(response.url, target)
    eq_(response.links, [])
    eq_(response.status_code, 200)
def test_no_http():
    """A seed URL without an explicit http:// scheme still crawls fully."""
    crawler = Patu(urls=['www.djangoproject.com'], depth=1)
    crawler.crawl()
    eq_(crawler.seen_urls, SEEN_URLS)
def test_file_input():
    """Seed URLs can be supplied via input_file instead of the urls list."""
    crawler = Patu(depth=1, input_file=TEST_INPUT)
    crawler.crawl()
    eq_(crawler.seen_urls, SEEN_URLS)
def test_crawl():
    """A depth-1 crawl of the test site visits exactly SEEN_URLS."""
    crawler = Patu(urls=[TEST_URL], depth=1)
    crawler.crawl()
    eq_(crawler.seen_urls, SEEN_URLS)
def test_parse_html():
    """get_urls extracts the expected set of links from the fixture page."""
    crawler = Patu(urls=[TEST_URL])
    response = crawler.get_urls(MockHttp(), TEST_URL)
    eq_(response.links, LINKS)