def test_run(path, links_all, links_unvisited, httpbin, recurse):
    c = Crawler(httpbin.url)
    c.run(url=path, recurse=recurse[0], throttle=recurse[1])
    # Every expected link should have been discovered and parsed.
    assert c.links.keys() == links_all.keys()
    for k, v in c.links.items():
        assert 'soup' in v
        if recurse[0] is False:
            # Without recursion, nothing beyond the start URL is visited.
            assert 'visited' not in v
        else:
            if k.startswith('http'):
                # External (absolute) links are recorded but never visited.
                assert 'visited' not in v
            else:
                assert 'visited' in v
    if recurse[0] is False:
        assert sorted(list(c.unvisited_links)) == sorted(links_unvisited)
    else:
        # A recursive crawl should leave no internal links unvisited.
        assert list(c.unvisited_links) == []
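The test relies on fixtures (path, links_all, links_unvisited, recurse) defined elsewhere, presumably in a conftest.py. A minimal sketch of what the recurse fixture might look like, assuming it is parametrized over (recurse, throttle) pairs so the test exercises both the flat and recursive paths (the parameter values here are illustrative, not the project's actual fixtures):

# conftest.py (hypothetical sketch; the real fixtures live elsewhere)
import pytest

# Each param is a (recurse, throttle) pair: the test reads recurse[0] as the
# recursion flag and recurse[1] as the throttle passed to Crawler.run().
# A throttle of 0 keeps the test suite fast.
@pytest.fixture(params=[(False, 0), (True, 0)])
def recurse(request):
    return request.param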
import argparse

# Crawler and Sitemap are assumed to be importable from the project's own
# modules; adjust these paths to match the actual package layout.
from crawler import Crawler
from sitemap import Sitemap


def main():
    parser = argparse.ArgumentParser(
        description='Generate an XML sitemap for a domain'
    )
    parser.add_argument(
        'domain', type=str, help='domain to crawl'
    )
    parser.add_argument('-f', '--file', help='write the xml to a file')
    parser.add_argument(
        '-t', '--throttle', type=int,
        help='max time in secs to wait between requesting URLs'
    )
    parser.add_argument(
        '-l', '--limit', type=int,
        help='max number of URLs to crawl'
    )
    parser.add_argument(
        '-q', '--quiet', action='store_true'
    )
    args = parser.parse_args()

    crawler = Crawler(args.domain, args.quiet, args.throttle, args.limit)
    # Only build a Sitemap when an output file was requested; initialising
    # it to None avoids a NameError when -f/--file is omitted.
    sitemap = None
    if args.file:
        sitemap = Sitemap(args.file, args.quiet)
    crawler.run(recurse=True)
    if sitemap:
        sitemap.run(crawler.domain_links)


if __name__ == '__main__':
    main()
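Run from the command line, an invocation might look like the following (the script name here is assumed; substitute the project's actual entry point):

python sitemap.py example.com -f sitemap.xml -t 2 -l 500 -q

When -t/--throttle or -l/--limit is omitted, argparse passes None through to Crawler, which presumably treats that as "no throttle" and "no limit".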