def test_url_plugin(self):
    conf_urls = {'this_wont_be_crawled': True}
    c = Crawler('/', conf_urls=conf_urls)
    c.run()
    with open('crawler_log') as logs:
        output = logs.read()
    self.assertIn(
        'These patterns were not matched during the crawl: this_wont_be_crawled',
        output)
def test_time_plugin(self):
    # This can't assert much, since the elapsed time isn't known in
    # advance; it only checks that the plugin logged its report line.
    c = Crawler('/')
    c.run()
    with open('crawler_log') as logs:
        output = logs.read()
    self.assertIn('Time taken:', output)
def test_memory_plugin(self):
    from test_utils.crawler.plugins.memory_plugin import Memory
    Memory.active = True
    c = Crawler('/')
    c.run()
    with open('crawler_log') as logs:
        output = logs.read()
    self.assertIn('Memory consumed:', output)
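For context, a minimal sketch of what a plugin such as Memory might look like, inferred from how the tests here and the loaders in Examples No. 13 and 14 below use it; the hook name and the measurement call are assumptions, not the package's verified API:

# Hedged sketch of a crawler plugin; the real memory_plugin may differ.
import logging
import resource  # Unix-only; memory measurement is platform-specific

log = logging.getLogger('crawler')


class Memory(object):
    # The tests enable the plugin by flipping this class attribute; the
    # Example No. 14 loader sets a module-level `active` flag instead.
    active = False

    def finish_run(self, sender, **kwargs):  # hook name is an assumption
        usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        log.info("Memory consumed: %s", usage)


# The Example No. 13 loader instantiates `plugin_module.PLUGIN()`, so a
# plugin module would also expose its class under that name.
PLUGIN = Memory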
Example No. 7
def test_relative_crawling(self):
    conf_urls = {}
    verbosity = 1
    c = Crawler('/1', conf_urls=conf_urls, verbosity=verbosity)
    c.run()
    self.assertEqual(c.crawled, {u'/1': True})
Example No. 8
def test_basic_crawling(self):
    conf_urls = {}
    verbosity = 1
    c = Crawler('/', conf_urls=conf_urls, verbosity=verbosity)
    c.run()
    self.assertEqual(c.crawled, {'/': True, u'/1': True, u'/2': True})
def test_relative_crawling(self):
    c = Crawler('/1')
    c.run()
    self.assertEqual(c.crawled, {u'/1': True})
def test_basic_crawling(self):
    c = Crawler('/')
    c.run()
    self.assertEqual(c.crawled, {'/': True, u'/1': True, u'/2': True})
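These assertions imply a test urlconf in which '/' links to '/1' and '/2', and those pages link nowhere further. A hedged sketch of such fixtures, with view names and URL patterns invented for illustration; the project's actual test urlconf may differ:

# Hypothetical fixtures matching the crawled-dict assertions above.
from django.conf.urls import url
from django.http import HttpResponse


def index(request):
    # The root page links to both children the crawler should discover.
    return HttpResponse('<a href="/1">one</a> <a href="/2">two</a>')


def leaf(request, num):
    # Leaf pages contain no links, so the crawl terminates here.
    return HttpResponse('page %s' % num)


urlpatterns = [
    url(r'^$', index),
    url(r'^(?P<num>[12])$', leaf),
]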
Example No. 13
    def handle(self, *args, **options):
        verbosity = int(options.get('verbosity', 1))
        depth = int(options.get('depth', 3))

        auth = _parse_auth(options.get('auth'))

        if verbosity > 1:
            log_level = logging.DEBUG
        elif verbosity:
            log_level = logging.INFO
        else:
            log_level = logging.WARN

        crawl_logger = logging.getLogger('crawler')
        crawl_logger.setLevel(logging.DEBUG)
        crawl_logger.propagate = 0

        log_stats = LogStatsHandler()

        crawl_logger.addHandler(log_stats)

        console = logging.StreamHandler()
        console.setLevel(log_level)
        console.setFormatter(
            logging.Formatter(
                "%(name)s [%(levelname)s] %(module)s: %(message)s"))

        crawl_logger.addHandler(console)

        if len(args) > 1:
            raise CommandError('Only one start url is currently supported.')
        else:
            start_url = args[0] if args else '/'

        if getattr(settings, 'ADMIN_FOR', None):
            settings_modules = [
                __import__(m, {}, {}, ['']) for m in settings.ADMIN_FOR
            ]
        else:
            settings_modules = [settings]

        conf_urls = {}

        # Build the list of URLs to test from urlpatterns:
        for settings_mod in settings_modules:
            try:
                urlconf = __import__(settings_mod.ROOT_URLCONF, {}, {}, [''])
            except Exception as e:
                logging.exception("Error occurred while trying to load %s: %s",
                                  settings_mod.ROOT_URLCONF, str(e))
                continue

            view_functions = extract_views_from_urlpatterns(
                urlconf.urlpatterns)
            for (func, regex, namespace, name) in view_functions:
                # Get the function name and record the view's dotted
                # location under its URL pattern.
                func_name = func.__name__ if hasattr(func, '__name__') else repr(func)
                conf_urls[regex] = [func.__module__, func_name]

        c = Crawler(
            start_url,
            conf_urls=conf_urls,
            verbosity=verbosity,
            output_dir=options.get("output_dir"),
            ascend=not options.get("no_parent"),
            auth=auth,
        )

        # Load plugins:
        for p in options['plugins']:
            # This nested try is somewhat unsightly but allows easy Pythonic
            # usage ("--enable-plugin=tidy") instead of Java-esque
            # "--enable-plugin=test_utils.crawler.plugins.tidy"
            try:
                try:
                    plugin_module = __import__(p)
                except ImportError:
                    if not "." in p:
                        plugin_module = __import__(
                            "test_utils.crawler.plugins.%s" % p,
                            fromlist=["test_utils.crawler.plugins"])
                    else:
                        raise

                c.plugins.append(plugin_module.PLUGIN())
            except (ImportError, AttributeError) as e:
                crawl_logger.critical("Unable to load plugin %s: %s", p, e)
                sys.exit(3)

        c.run(max_depth=depth)

        # Exit non-zero if anything was logged at ERROR (status 2) or
        # WARNING (status 1) severity during the crawl.
        max_log_level = max(log_stats.stats.keys() or [logging.NOTSET])
        if max_log_level >= logging.ERROR:
            sys.exit(2)
        elif max_log_level >= logging.WARNING:
            sys.exit(1)
        else:
            sys.exit(0)
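LogStatsHandler itself is not shown in this listing; from its use above (per-level stats consulted for the exit status) it behaves like a logging.Handler that tallies records by level. A hedged reconstruction; the real implementation may differ:

# Hedged reconstruction of LogStatsHandler as implied by its use above.
import logging
from collections import defaultdict


class LogStatsHandler(logging.Handler):
    def __init__(self):
        logging.Handler.__init__(self)
        self.stats = defaultdict(int)

    def emit(self, record):
        # Keyed by numeric level (logging.WARNING, logging.ERROR, ...),
        # so max(self.stats.keys()) yields the most severe level seen.
        self.stats[record.levelno] += 1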
Example No. 14
        # Load plugins:
        for p in options['plugins']:
            # This nested try is somewhat unsightly but allows easy Pythonic
            # usage ("--enable-plugin=tidy") instead of Java-esque
            # "--enable-plugin=test_utils.crawler.plugins.tidy"
            try:
                try:
                    plugin_module = __import__(p)
                except ImportError:
                    if not "." in p:
                        plugin_module = __import__("test_utils.crawler.plugins.%s" % p)
                    else:
                        raise

                plugin_module.active = True
            except ImportError as e:
                crawl_logger.critical("Unable to load plugin %s: %s", p, e)
                sys.exit(3)

        c = Crawler(start_url, conf_urls=conf_urls, verbosity=verbosity)
        c.run(max_depth=depth)

        # Exit non-zero if anything was logged at ERROR (status 2) or
        # WARNING (status 1) severity during the crawl.
        max_log_level = max(log_stats.stats.keys() or [logging.NOTSET])
        if max_log_level >= logging.ERROR:
            sys.exit(2)
        elif max_log_level >= logging.WARNING:
            sys.exit(1)
        else:
            sys.exit(0)
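Finally, a hedged sketch of driving this command from Python via Django's call_command; the command name 'crawlurls' and the plugin name passed here are assumptions based on the handle() signature above, not verified against the package:

# Hypothetical invocation; 'crawlurls' is an assumed command name.
from django.core.management import call_command

try:
    call_command('crawlurls', '/', verbosity=2, depth=2,
                 plugins=['time_plugin'])
except SystemExit as status:
    # handle() always calls sys.exit(), so the crawl status surfaces as
    # a SystemExit rather than a return value.
    print('crawl exited with status %s' % status.code)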