Example #1
def test_run(path, links_all, links_unvisited, httpbin, recurse):
    # `httpbin` is the pytest-httpbin fixture (local test server); `recurse`
    # is a (recurse_flag, throttle) parameter pair.
    c = Crawler(httpbin.url)
    c.run(url=path, recurse=recurse[0], throttle=recurse[1])

    # Every expected link is recorded, each with its parsed soup cached.
    assert c.links.keys() == links_all.keys()
    for k, v in c.links.items():
        assert 'soup' in v
        if recurse[0] is False:
            assert 'visited' not in v
        elif k.startswith('http'):
            # Absolute URLs are recorded but never marked as visited.
            assert 'visited' not in v
        else:
            assert 'visited' in v

    # Without recursion the unvisited queue matches the expected set;
    # with recursion nothing is left to visit.
    if recurse[0] is False:
        assert sorted(c.unvisited_links) == sorted(links_unvisited)
    else:
        assert list(c.unvisited_links) == []
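
The fixtures this test relies on are not shown in the snippet. A minimal sketch of what they could look like, assuming pytest-httpbin serves its standard /links pages and that recurse is parametrized as a (flag, throttle) pair; every name and value below is hypothetical:

import pytest

# Hypothetical fixtures for illustration only; the real project defines its own.
@pytest.fixture(params=[(False, 0), (True, 0)])
def recurse(request):
    # (recurse_flag, throttle_seconds) combinations exercised by test_run
    return request.param

@pytest.fixture
def path():
    # Entry page served by the local httpbin instance
    return '/links/3/0'

@pytest.fixture
def links_all():
    # Expected bookkeeping keyed by URL, mirroring Crawler.links
    return {'/links/3/0': {}, '/links/3/1': {}, '/links/3/2': {}}

@pytest.fixture
def links_unvisited(links_all):
    # Everything except the entry page starts out unvisited
    return [k for k in links_all if k != '/links/3/0']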
import argparse

# Crawler and Sitemap are assumed to be importable from this project's own
# crawler and sitemap modules.


def main():
    parser = argparse.ArgumentParser(
        description='Generate an XML sitemap for a domain'
    )
    parser.add_argument(
        'domain',
        type=str,
        help='domain to crawl'
    )
    parser.add_argument('-f', '--file', help='write the xml to a file')
    parser.add_argument(
        '-t',
        '--throttle',
        type=int,
        help='max time in secs to wait between requesting URLs'
    )
    parser.add_argument(
        '-l',
        '--limit',
        type=int,
        help='max number of URLs to crawl'
    )
    parser.add_argument(
        '-q',
        '--quiet',
        action='store_true',
        help='suppress progress output'
    )
    args = parser.parse_args()

    crawler = Crawler(args.domain, args.quiet, args.throttle, args.limit)
    # Initialise to None so the check below cannot raise NameError when no
    # output file is requested.
    sitemap = None
    if args.file:
        sitemap = Sitemap(args.file, args.quiet)

    crawler.run(recurse=True)
    if sitemap:
        sitemap.run(crawler.domain_links)
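
To run the script directly, the usual entry-point guard can be appended; the file name and arguments in the comment are illustrative only:

if __name__ == '__main__':
    main()

# Example invocation (illustrative):
#   python sitemap_gen.py example.com -f sitemap.xml -t 2 -l 100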