Exemple #1
0
def test_main_process_keyboard():
    """Simulate Ctrl-C during a crawl and check that nothing was recorded.

    ``process_next_url`` is replaced on the instance with a stub that raises
    ``KeyboardInterrupt``. The test expects ``crawl()`` to return normally
    and ``seen_urls`` to be an empty set afterwards.
    """
    def interrupt():
        raise KeyboardInterrupt

    crawler = Patu(depth=1, urls=['www.djangoproject.com'])
    # Instance-level override: the stub is called without ``self``.
    crawler.process_next_url = interrupt
    crawler.crawl()
    eq_(crawler.seen_urls, set())
Exemple #2
0
def test_initial_redirect():
    """Crawl from two different start URLs and expect the same seen set.

    Each start URL gets its own ``Patu`` instance at depth 2, and each crawl
    must end with ``seen_urls`` equal to ``SEEN_URLS``.
    NOTE(review): presumably both hosts redirect to the canonical site that
    SEEN_URLS describes -- confirm against the test fixtures.
    """
    for start_url in ('redirect.me', 'djangoproject.com'):
        crawler = Patu(urls=[start_url], depth=2)
        crawler.crawl()
        eq_(crawler.seen_urls, SEEN_URLS)
Exemple #3
0
def test_generate():
    """Crawl TEST_URL with ``generate=True`` and check the text written to stdout.

    ``sys.stdout`` is temporarily pointed at a scratch file so the crawler's
    output can be read back and compared, after stripping surrounding
    whitespace, with the expected listing. Each expected line holds two
    TAB-separated fields.
    NOTE(review): the fields look like "crawled URL" and "referring URL"
    (``None`` for the start page) -- confirm against the crawler.
    """
    output_path = '.test_generated.txt'
    saved_stdout = sys.stdout
    try:
        with open(output_path, 'w') as f:
            sys.stdout = f
            try:
                p = Patu(urls=[TEST_URL], depth=1, generate=True)
                p.crawl()
            finally:
                # Always put the real stdout back, even if the crawl raises;
                # otherwise later output would go to a closed file.
                sys.stdout = saved_stdout
        with open(output_path, 'r') as f:
            generated_urls = f.read().strip()
    finally:
        # Do not leave the scratch file behind when the test fails midway.
        remove(output_path)

    # Explicit "\t" escapes instead of literal tabs inside a triple-quoted
    # string, so an editor cannot silently turn the separators into spaces.
    correct_urls = "\n".join([
        "http://www.djangoproject.com\tNone",
        "http://www.djangoproject.com/weblog/\thttp://www.djangoproject.com",
        "http://www.djangoproject.com/weblog/2010/apr/22/django-1_2-release-schedule-update-6/\thttp://www.djangoproject.com",
        "http://www.djangoproject.com/\thttp://www.djangoproject.com",
        "http://www.djangoproject.com/weblog/2010/apr/28/django-1_2-release-schedule-update-7/\thttp://www.djangoproject.com",
        "http://www.djangoproject.com/weblog/2010/may/05/12-rc-1/\thttp://www.djangoproject.com",
        "http://www.djangoproject.com/weblog/2010/apr/14/django-1_2-release-schedule-update-5/\thttp://www.djangoproject.com",
        "http://www.djangoproject.com/foundation/\thttp://www.djangoproject.com",
        "http://www.djangoproject.com/community/\thttp://www.djangoproject.com",
        "http://www.djangoproject.com/download/\thttp://www.djangoproject.com",
    ])
    eq_(generated_urls, correct_urls)
Exemple #4
0
def main():
    """Command-line entry point: parse options, build a Patu spider, crawl.

    Positional command-line arguments are the URLs to crawl. Every parsed
    option is forwarded to ``Patu`` as a keyword argument of the same
    meaning (``spiders``, ``spinner``, ``verbose``, ``depth``, ``generate``,
    ``input_file``).
    """
    parser = OptionParser()
    # Each row is: short flag, long flag, keyword arguments for add_option().
    options_a = [
        ["-s", "--spiders", dict(dest="spiders", type="int", default=1, help="sends more than one spider")],
        ["-S", "--nospinner", dict(dest="spinner", action="store_false", default=True, help="turns off the spinner")],
        ["-v", "--verbose", dict(dest="verbose", action="store_true", default=False, help="outputs every request (implies --nospinner)")],
        ["-d", "--depth", dict(dest="depth", type="int", default=-1, help="does a breadth-first crawl, stopping after DEPTH levels")],
        ['-g', '--generate', dict(dest='generate', action='store_true', default=False, help='generate a list of crawled URLs on stdout')],
        ['-i', '--input', dict(dest='input_file', type='str', default='', help='file of URLs to crawl')],
    ]
    for short_flag, long_flag, params in options_a:
        parser.add_option(short_flag, long_flag, **params)
    (options, args) = parser.parse_args()

    # Positional arguments are the start URLs, converted to unicode strings.
    urls = [unicode(url) for url in args]
    kwargs = {
        'urls': urls,
        'spiders': options.spiders,
        'spinner': options.spinner,
        'verbose': options.verbose,
        'depth': options.depth,
        'generate': options.generate,
        'input_file': options.input_file
    }
    spider = Patu(**kwargs)
    spider.crawl()
    # Python 2 bare print statement (this code also relies on ``unicode``):
    # writes a newline once the crawl is done.
    print
Exemple #5
0
def test_stdin():
    """Crawl with ``input_file='-'`` while ``sys.stdin`` is fed from TEST_INPUT.

    ``sys.stdin`` is temporarily replaced by the opened TEST_INPUT file, and
    the crawl must end with ``seen_urls`` equal to ``SEEN_URLS``.
    NOTE(review): ``'-'`` presumably tells Patu to read its URL list from
    stdin -- confirm against the Patu constructor.
    """
    saved_stdin = sys.stdin
    with open(TEST_INPUT) as f:
        sys.stdin = f
        try:
            p = Patu(depth=1, input_file='-', verbose=True)
            p.crawl()
        finally:
            # Restore the real stdin even if the crawl raises, so a failure
            # here cannot leave later code reading from a closed file.
            sys.stdin = saved_stdin
    eq_(p.seen_urls, SEEN_URLS)
Exemple #6
0
def test_error():
    """Crawl 'error.me' and check the error line written to stdout.

    ``sys.stdout`` is temporarily pointed at a scratch file. After the crawl
    the file's stripped content must be exactly
    ``[500] http://error.me (from None)``.
    """
    # Local import so this block does not depend on a module-level import.
    from os import remove

    output_path = '.test_generated.txt'
    saved_stdout = sys.stdout
    try:
        with open(output_path, 'w') as f:
            sys.stdout = f
            try:
                p = Patu(urls=['error.me'], depth=1)
                p.crawl()
            finally:
                # Always put the real stdout back, even if the crawl raises.
                sys.stdout = saved_stdout
        with open(output_path, 'r') as f:
            output = f.read().strip()
    finally:
        # Clean up the scratch file instead of leaving it in the working dir.
        remove(output_path)
    eq_(output, '[500] http://error.me (from None)')
Exemple #7
0
def test_no_http():
    """Crawl a start URL written without an ``http://`` prefix.

    After a depth-1 crawl, ``seen_urls`` must equal ``SEEN_URLS``.
    """
    crawler = Patu(depth=1, urls=['www.djangoproject.com'])
    crawler.crawl()
    eq_(crawler.seen_urls, SEEN_URLS)
Exemple #8
0
def test_file_input():
    """Crawl using TEST_INPUT as the ``input_file`` instead of explicit URLs.

    After a depth-1 crawl, ``seen_urls`` must equal ``SEEN_URLS``.
    """
    crawler = Patu(input_file=TEST_INPUT, depth=1)
    crawler.crawl()
    eq_(crawler.seen_urls, SEEN_URLS)
Exemple #9
0
def test_crawl():
    """Run a depth-1 crawl starting at TEST_URL.

    Afterwards ``seen_urls`` must equal ``SEEN_URLS``.
    """
    crawler = Patu(depth=1, urls=[TEST_URL])
    crawler.crawl()
    eq_(crawler.seen_urls, SEEN_URLS)