Exemple #1
0
def test_main_process_keyboard():
    """A KeyboardInterrupt raised mid-crawl should abort with no URLs seen."""
    spider = Patu(urls=['www.djangoproject.com'], depth=1)

    def raise_interrupt():
        raise KeyboardInterrupt

    # Force the interrupt on the very first URL the crawler processes.
    spider.process_next_url = raise_interrupt
    spider.crawl()
    eq_(spider.seen_urls, set())
Exemple #2
0
def test_generate():
    """Crawling with generate=True should print the crawled URL list to stdout.

    stdout is temporarily redirected into a scratch file so the generated
    output can be compared against the known-good URL/referer list.
    """
    with open('.test_generated.txt', 'w') as f:
        s = sys.stdout
        sys.stdout = f
        try:
            p = Patu(urls=[TEST_URL], depth=1, generate=True)
            p.crawl()
        finally:
            # Restore stdout even if crawl() raises, so later tests (and the
            # test runner itself) are not left writing into the temp file.
            sys.stdout = s
    with open('.test_generated.txt', 'r') as f:
        generated_urls = f.read().strip()
    remove('.test_generated.txt')
    # Expected output: one "url<TAB>referer" pair per line.
    correct_urls = """
http://www.djangoproject.com	None
http://www.djangoproject.com/weblog/	http://www.djangoproject.com
http://www.djangoproject.com/weblog/2010/apr/22/django-1_2-release-schedule-update-6/	http://www.djangoproject.com
http://www.djangoproject.com/	http://www.djangoproject.com
http://www.djangoproject.com/weblog/2010/apr/28/django-1_2-release-schedule-update-7/	http://www.djangoproject.com
http://www.djangoproject.com/weblog/2010/may/05/12-rc-1/	http://www.djangoproject.com
http://www.djangoproject.com/weblog/2010/apr/14/django-1_2-release-schedule-update-5/	http://www.djangoproject.com
http://www.djangoproject.com/foundation/	http://www.djangoproject.com
http://www.djangoproject.com/community/	http://www.djangoproject.com
http://www.djangoproject.com/download/	http://www.djangoproject.com
"""
    correct_urls = correct_urls.strip()
    eq_(generated_urls, correct_urls)
Exemple #3
0
def main():
    """Parse command-line options and kick off a Patu crawl over the URL args."""
    parser = OptionParser()
    # (short flag, long flag, add_option keyword arguments)
    option_defs = (
        ("-s", "--spiders", dict(dest="spiders", type="int", default=1, help="sends more than one spider")),
        ("-S", "--nospinner", dict(dest="spinner", action="store_false", default=True, help="turns off the spinner")),
        ("-v", "--verbose", dict(dest="verbose", action="store_true", default=False, help="outputs every request (implies --nospiner)")),
        ("-d", "--depth", dict(dest="depth", type="int", default=-1, help="does a breadth-first crawl, stopping after DEPTH levels")),
        ("-g", "--generate", dict(dest="generate", action="store_true", default=False, help="generate a list of crawled URLs on stdout")),
        ("-i", "--input", dict(dest="input_file", type="str", default="", help="file of URLs to crawl")),
    )
    for short_flag, long_flag, opt_kwargs in option_defs:
        parser.add_option(short_flag, long_flag, **opt_kwargs)
    options, args = parser.parse_args()
    # Submit first url; positional args are the seed URLs.
    spider = Patu(
        urls=[unicode(url) for url in args],
        spiders=options.spiders,
        spinner=options.spinner,
        verbose=options.verbose,
        depth=options.depth,
        generate=options.generate,
        input_file=options.input_file,
    )
    spider.crawl()
    print
Exemple #4
0
def test_worker():
    """worker() should fetch a queued URL and put its page content on done_queue."""
    spider = Patu(urls=['www.djangoproject.com'], depth=1)
    # Queue every pending URL, then the sentinel that stops the worker.
    for queued_url in spider.next_urls:
        spider.task_queue.put(queued_url)
    spider.task_queue.put('STOP')
    spider.worker()
    fetched_content = spider.done_queue.get().content

    with open(TEST_HTML) as html_file:
        eq_(html_file.read(), fetched_content)
Exemple #5
0
def test_stdin():
    """input_file='-' should make the crawler read its URL list from stdin."""
    with open(TEST_INPUT) as f:
        s = sys.stdin
        sys.stdin = f
        try:
            p = Patu(depth=1, input_file='-', verbose=True)
            p.crawl()
        finally:
            # Restore stdin even if crawl() raises, so later tests are not
            # left reading from the (now-closed) fixture file.
            sys.stdin = s
    eq_(p.seen_urls, SEEN_URLS)
Exemple #6
0
def test_error():
    """A failing URL should be reported on stdout as a 500 with its referer."""
    with open('.test_generated.txt', 'w') as f:
        s = sys.stdout
        sys.stdout = f
        try:
            p = Patu(urls=['error.me'], depth=1)
            p.crawl()
        finally:
            # Restore stdout even if crawl() raises, so later tests (and the
            # test runner itself) are not left writing into the temp file.
            sys.stdout = s
    with open('.test_generated.txt', 'r') as f:
        eq_(f.read().strip(), '[500] http://error.me (from None)')
Exemple #7
0
def test_worker_input_file():
    """With an input file, every fetched result's URL must be a known seen URL.

    Results lacking a .url attribute (e.g. error placeholders) are accepted too.
    """
    spider = Patu(urls=['www.djangoproject.com'], depth=1, input_file=TEST_INPUT)
    for queued_url in spider.next_urls:
        spider.task_queue.put(queued_url)
    spider.task_queue.put('STOP')
    spider.worker()
    spider.done_queue.put('STOP')
    for result in iter(spider.done_queue.get, 'STOP'):
        # Fall back to False when the result object has no url attribute.
        fetched_url = getattr(result, 'url', False)
        assert fetched_url in SEEN_URLS or not fetched_url
Exemple #8
0
def test_initial_redirect():
    """Start URLs that redirect should still produce the full seen-URL set."""
    for start_url in ('redirect.me', 'djangoproject.com'):
        spider = Patu(urls=[start_url], depth=2)
        spider.crawl()
        eq_(spider.seen_urls, SEEN_URLS)
Exemple #9
0
def test_worker_statuses():
    """
    This is kind of wanking - just trying to get test coverage in the worker
    processes
    """
    # (start address, status code the worker should report)
    cases = (
        ('www.djangoproject.com/offsite_redirect', 200),
        ('error.me', 500),
        ('io.me', -1),
        ('keyboard.me', -1),
    )

    for address, expected_status in cases:
        spider = Patu(urls=[address], depth=1)
        for queued_url in spider.next_urls:
            spider.task_queue.put(queued_url)
        spider.task_queue.put('STOP')
        spider.worker()
        result = spider.done_queue.get()
        eq_(result.status_code, expected_status)
Exemple #10
0
def test_redirect():
    """An offsite redirect is reported as a 200 with no followable links."""
    spider = Patu(urls=['www.djangoproject.com'])
    redirect_url = 'http://www.djangoproject.com/offsite_redirect'
    response = spider.get_urls(MockHttp(), redirect_url)
    eq_(response.url, redirect_url)
    eq_(response.links, [])
    eq_(response.status_code, 200)
Exemple #11
0
def test_no_http():
    """A seed URL without an http:// scheme should still crawl normally."""
    spider = Patu(urls=['www.djangoproject.com'], depth=1)
    spider.crawl()
    eq_(spider.seen_urls, SEEN_URLS)
Exemple #12
0
def test_file_input():
    """Seeding the crawl from an input file should yield the expected URL set."""
    spider = Patu(depth=1, input_file=TEST_INPUT)
    spider.crawl()
    eq_(spider.seen_urls, SEEN_URLS)
Exemple #13
0
def test_crawl():
    """A depth-1 crawl of the test URL should record the expected URL set."""
    spider = Patu(urls=[TEST_URL], depth=1)
    spider.crawl()
    eq_(spider.seen_urls, SEEN_URLS)
Exemple #14
0
def test_parse_html():
    """get_urls() should extract the known link list from the test page."""
    spider = Patu(urls=[TEST_URL])
    response = spider.get_urls(MockHttp(), TEST_URL)
    eq_(response.links, LINKS)