def test_url_plugin(self):
    conf_urls = {'this_wont_be_crawled': True}
    c = Crawler('/', conf_urls=conf_urls)
    c.run()
    logs = open('crawler_log')
    output = logs.read()
    self.assertTrue(
        output.find(
            'These patterns were not matched during the crawl: '
            'this_wont_be_crawled'
        ) != -1)
def test_time_plugin(self):
    # This isn't testing much, since we can't know in advance how long
    # the crawl will take.
    c = Crawler('/')
    c.run()
    logs = open('crawler_log')
    output = logs.read()
    self.assertTrue(output.find('Time taken:') != -1)
def test_memory_plugin(self):
    from test_utils.crawler.plugins.memory_plugin import Memory
    Memory.active = True
    c = Crawler('/')
    c.run()
    logs = open('crawler_log')
    output = logs.read()
    self.assertTrue(output.find('Memory consumed:') != -1)
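# A minimal sketch of the harness the plugin tests above assume. The
# class name, base class, and import path are assumptions for
# illustration, not taken from the source; each run writes plugin
# output to the 'crawler_log' file that the assertions then read.
from django.test import TestCase
from test_utils.crawler.base import Crawler  # assumed import path

class CrawlerPluginTests(TestCase):
    urls = 'test_app.urls'  # hypothetical URLconf serving '/', '/1', '/2'

    # ... the test methods shown above live here ...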
def test_relative_crawling(self):
    conf_urls = {}
    verbosity = 1
    c = Crawler('/1', conf_urls=conf_urls, verbosity=verbosity)
    c.run()
    self.assertEqual(c.crawled, {u'/1': True})
def test_basic_crawling(self):
    conf_urls = {}
    verbosity = 1
    c = Crawler('/', conf_urls=conf_urls, verbosity=verbosity)
    c.run()
    self.assertEqual(c.crawled, {'/': True, u'/1': True, u'/2': True})
def test_relative_crawling(self):
    c = Crawler('/1')
    c.run()
    self.assertEqual(c.crawled, {u'/1': True})
def test_basic_crawling(self):
    c = Crawler('/')
    c.run()
    self.assertEqual(c.crawled, {'/': True, u'/1': True, u'/2': True})
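# The crawling tests above imply a tiny URLconf in which '/' links to
# '/1' and '/2', and the numbered pages have no outgoing links. A
# hypothetical sketch of such a fixture (view names and markup are
# assumptions, not from the source; django.urls.path assumes a modern
# Django, which postdates the original project):
from django.http import HttpResponse
from django.urls import path

def home(request):
    return HttpResponse('<a href="/1">one</a> <a href="/2">two</a>')

def leaf(request):
    return HttpResponse('a page with no outgoing links')

urlpatterns = [
    path('', home),
    path('1', leaf),
    path('2', leaf),
]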
def handle(self, *args, **options):
    verbosity = int(options.get('verbosity', 1))
    depth = int(options.get('depth', 3))
    auth = _parse_auth(options.get('auth'))

    if verbosity > 1:
        log_level = logging.DEBUG
    elif verbosity:
        log_level = logging.INFO
    else:
        log_level = logging.WARN

    crawl_logger = logging.getLogger('crawler')
    crawl_logger.setLevel(logging.DEBUG)
    crawl_logger.propagate = 0

    log_stats = LogStatsHandler()
    crawl_logger.addHandler(log_stats)

    console = logging.StreamHandler()
    console.setLevel(log_level)
    console.setFormatter(
        logging.Formatter("%(name)s [%(levelname)s] %(module)s: %(message)s"))
    crawl_logger.addHandler(console)

    if len(args) > 1:
        raise CommandError('Only one start url is currently supported.')
    else:
        start_url = args[0] if args else '/'

    if getattr(settings, 'ADMIN_FOR', None):
        settings_modules = [__import__(m, {}, {}, [''])
                            for m in settings.ADMIN_FOR]
    else:
        settings_modules = [settings]

    conf_urls = {}

    # Build the list of URLs to test from urlpatterns:
    for settings_mod in settings_modules:
        try:
            urlconf = __import__(settings_mod.ROOT_URLCONF, {}, {}, [''])
        except Exception as e:
            logging.exception("Error occurred while trying to load %s: %s",
                              settings_mod.ROOT_URLCONF, str(e))
            continue

        view_functions = extract_views_from_urlpatterns(urlconf.urlpatterns)
        for (func, regex, namespace, name) in view_functions:
            # Get the function name and add it to the hash of URLConf urls.
            func_name = getattr(func, '__name__', None) or repr(func)
            conf_urls[regex] = [func.__module__, func_name]

    c = Crawler(
        start_url,
        conf_urls=conf_urls,
        verbosity=verbosity,
        output_dir=options.get("output_dir"),
        ascend=not options.get("no_parent"),
        auth=auth,
    )

    # Load plugins:
    for p in options['plugins']:
        # This nested try is somewhat unsightly but allows easy Pythonic
        # usage ("--enable-plugin=tidy") instead of Java-esque
        # "--enable-plugin=test_utils.crawler.plugins.tidy"
        try:
            try:
                plugin_module = __import__(p)
            except ImportError:
                if "." not in p:
                    plugin_module = __import__(
                        "test_utils.crawler.plugins.%s" % p,
                        fromlist=["test_utils.crawler.plugins"])
                else:
                    raise
            c.plugins.append(plugin_module.PLUGIN())
        except (ImportError, AttributeError) as e:
            crawl_logger.critical("Unable to load plugin %s: %s", p, e)
            sys.exit(3)

    c.run(max_depth=depth)

    # We'll exit with a non-zero status if we had any errors.
    max_log_level = max(log_stats.stats.keys())
    if max_log_level >= logging.ERROR:
        sys.exit(2)
    elif max_log_level >= logging.WARNING:
        sys.exit(1)
    else:
        sys.exit(0)
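# _parse_auth() is referenced above but not defined in this excerpt. A
# minimal sketch under the assumption that --auth takes a
# "username:password" string; the option format and the returned dict
# shape are assumptions, not taken from the source.
def _parse_auth(auth):
    """Return a credentials dict for 'user:pass' input, or None."""
    if not auth:
        return None
    username, _, password = auth.partition(':')
    return {'username': username, 'password': password}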
# Load plugins:
for p in options['plugins']:
    # This nested try is somewhat unsightly but allows easy Pythonic
    # usage ("--enable-plugin=tidy") instead of Java-esque
    # "--enable-plugin=test_utils.crawler.plugins.tidy"
    try:
        try:
            plugin_module = __import__(p)
        except ImportError:
            if "." not in p:
                # fromlist is needed so __import__ returns the plugin
                # module itself rather than the top-level package.
                plugin_module = __import__(
                    "test_utils.crawler.plugins.%s" % p,
                    fromlist=["test_utils.crawler.plugins"])
            else:
                raise
        plugin_module.active = True
    except ImportError as e:
        crawl_logger.critical("Unable to load plugin %s: %s", p, e)
        sys.exit(3)

c = Crawler(start_url, conf_urls=conf_urls, verbosity=verbosity)
c.run(max_depth=depth)

# We'll exit with a non-zero status if we had any errors.
max_log_level = max(log_stats.stats.keys())
if max_log_level >= logging.ERROR:
    sys.exit(2)
elif max_log_level >= logging.WARNING:
    sys.exit(1)
else:
    sys.exit(0)
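# Example invocation of the command above through Django's call_command.
# The command name 'crawlurls' is an assumption for illustration; the
# 'tidy' plugin name comes from the comment in the loader above. Note
# that handle() finishes via sys.exit(), so this raises SystemExit
# carrying the crawl's exit status.
from django.core.management import call_command

call_command('crawlurls', '/', verbosity=1, depth=2, plugins=['tidy'])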