def test_time_plugin(self):
    # The elapsed time is nondeterministic, so only check that the log line exists.
    c = Crawler('/')
    c.run()
    with open('crawler_log') as logs:
        output = logs.read()
    self.assertTrue(output.find('Time taken:') != -1)
def test_url_plugin(self):
    conf_urls = {'this_wont_be_crawled': True}
    c = Crawler('/', conf_urls=conf_urls)
    c.run()
    with open('crawler_log') as logs:
        output = logs.read()
    self.assertTrue(output.find(
        'These patterns were not matched during the crawl: this_wont_be_crawled'
    ) != -1)
def test_memory_plugin(self):
    from crawler.plugins.memory_plugin import Memory
    Memory.active = True
    c = Crawler('/')
    c.run()
    with open('crawler_log') as logs:
        output = logs.read()
    self.assertTrue(output.find('Memory consumed:') != -1)
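The plugin tests above only assert that each plugin wrote its marker line ('Time taken:', 'Memory consumed:') to the crawler log. The loader in Example #9 below shows the one hard requirement on a plugin module: it must expose a PLUGIN attribute, which gets instantiated and appended to c.plugins. A minimal sketch of such a module; the hook name post_request and its signature are assumptions, not taken from the source:

import logging

log = logging.getLogger('crawler')


class Elapsed(object):
    # Mirrors Memory.active above: tests can flip this flag to enable the plugin.
    active = True

    def post_request(self, sender, response, url, **kwargs):
        # Hypothetical hook; the real plugin interface is not shown in these excerpts.
        if self.active:
            log.info('Time taken: %0.2fs for %s', kwargs.get('elapsed', 0.0), url)


# The loader does plugin_module.PLUGIN(), so export the class under that name.
PLUGIN = Elapsed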
def test_relative_crawling(self):
    c = Crawler('/1')
    c.run()
    self.assertEqual(c.crawled, {u'/1': True})

def test_basic_crawling(self):
    c = Crawler('/')
    c.run()
    self.assertEqual(c.crawled, {'/': True, u'/1': True, u'/2': True})
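These two assertions imply a test urlconf that serves '/', '/1', and '/2', with the root page linking to the other two so the crawl can discover them. A minimal sketch of such a fixture under that assumption, using the old-style Django URL API this code targets; the view names and response bodies are hypothetical:

from django.conf.urls.defaults import patterns, url
from django.http import HttpResponse


def home(request):
    # The links let a crawl starting at '/' discover /1 and /2.
    return HttpResponse('<a href="/1">one</a> <a href="/2">two</a>')


def leaf(request):
    return HttpResponse('leaf page')


urlpatterns = patterns('',
    url(r'^$', home),
    url(r'^1$', leaf),
    url(r'^2$', leaf),
)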
Example #9
class Command(BaseCommand):
    option_list = BaseCommand.option_list + (
        make_option('-p',
                    '--pdb',
                    action='store_true',
                    dest='pdb',
                    default=False,
                    help='Pass -p to drop into pdb on an error'),
        make_option('-d',
                    '--depth',
                    action='store',
                    dest='depth',
                    default=3,
                    help='Specify the depth to crawl.'),
        make_option('-s',
                    '--safe',
                    action='store_true',
                    dest='html',
                    default=False,
                    help='Pass -s to check for html fragments in your pages.'),
        make_option('-r',
                    '--response',
                    action='store_true',
                    dest='response',
                    default=False,
                    help='Pass -r to store the response objects.'),
        make_option('-t',
                    '--time',
                    action='store_true',
                    dest='time',
                    default=False,
                    help='Pass -t to time your requests.'),
        make_option('--enable-plugin',
                    action='append',
                    dest='plugins',
                    default=[],
                    help='Enable the specified plugin'),
        make_option('-o',
                    '--output-dir',
                    action='store',
                    dest='output_dir',
                    default=None,
                    help='If specified, store plugin output in the provided directory'),
        make_option('--no-parent',
                    action='store_true',
                    dest='no_parent',
                    default=False,
                    help='Do not crawl URLs which do not start with your base URL'),
        make_option('-a',
                    '--auth',
                    action='store',
                    dest='auth',
                    default=None,
                    help='Authenticate before crawl. Example: --auth username:foo,password:bar'))

    help = "Displays all of the url matching routes for the project."
    args = "[relative start url]"

    def handle(self, *args, **options):
        verbosity = int(options.get('verbosity', 1))
        depth = int(options.get('depth', 3))

        auth = _parse_auth(options.get('auth'))

        if verbosity == 3:
            log_level = 1
        elif verbosity == 2:
            log_level = logging.DEBUG
        elif verbosity:
            log_level = logging.INFO
        else:
            log_level = logging.WARN

        crawl_logger = logging.getLogger('crawler')
        crawl_logger.setLevel(logging.DEBUG)
        crawl_logger.propagate = 0

        log_stats = LogStatsHandler()

        crawl_logger.addHandler(log_stats)

        console = logging.StreamHandler()
        console.setLevel(log_level)
        console.setFormatter(
            logging.Formatter(
                "%(name)s [%(levelname)s] %(module)s: %(message)s"))

        crawl_logger.addHandler(console)

        if len(args) > 1:
            raise CommandError('Only one start url is currently supported.')
        else:
            start_url = args[0] if args else '/'

        if settings.ADMIN_FOR:
            settings_modules = [
                __import__(m, {}, {}, ['']) for m in settings.ADMIN_FOR
            ]
        else:
            settings_modules = [settings]

        conf_urls = {}

        # Build the list URLs to test from urlpatterns:
        for settings_mod in settings_modules:
            try:
                urlconf = __import__(settings_mod.ROOT_URLCONF, {}, {}, [''])
            except Exception, e:
                logging.exception("Error occurred while trying to load %s: %s",
                                  settings_mod.ROOT_URLCONF, str(e))
                continue

            view_functions = extract_views_from_urlpatterns(
                urlconf.urlpatterns)
            for (func, regex) in view_functions:
                # Get the view's name and record it in the hash of URLconf urls
                func_name = getattr(func, '__name__', repr(func))
                conf_urls[regex] = [func.__module__, func_name]

        c = Crawler(
            start_url,
            conf_urls=conf_urls,
            verbosity=verbosity,
            output_dir=options.get("output_dir"),
            ascend=not options.get("no_parent"),
            auth=auth,
        )

        # Load plugins:
        for p in options['plugins']:
            # This nested try is somewhat unsightly but allows easy Pythonic
            # usage ("--enable-plugin=tidy") instead of Java-esque
            # "--enable-plugin=crawler.plugins.tidy"
            try:
                try:
                    plugin_module = __import__(p)
                except ImportError:
                    if not "." in p:
                        plugin_module = __import__(
                            "crawler.plugins.%s" % p,
                            fromlist=["crawler.plugins"])
                    else:
                        raise

                c.plugins.append(plugin_module.PLUGIN())
            except (ImportError, AttributeError), e:
                crawl_logger.critical("Unable to load plugin %s: %s", p, e)
                sys.exit(3)
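handle() calls _parse_auth() on the raw --auth value, but that helper is not included in the excerpt above. A minimal sketch of what it might look like, assuming the key:value,key:value format shown in the option's help text; the real implementation may differ:

def _parse_auth(auth):
    # '--auth username:foo,password:bar' -> {'username': 'foo', 'password': 'bar'}
    if not auth:
        return None
    credentials = {}
    for pair in auth.split(','):
        key, _, value = pair.partition(':')
        credentials[key.strip()] = value.strip()
    return credentials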
Example #10
def __init__(self):
    self.file_start_time = datetime.now()
    self.time_delta = timedelta(hours=6)
    self.base_url = "http://pinterest.com"
    Crawler.__init__(self)
Example #13
def __init__(self):
    self.base_url = "http://www.pinterest.com"
    links = ["http://www.pinterest.com", "http://www.pinterest.com/all/"]
    Crawler.__init__(self, links)
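Examples #10 and #13 show two ways a subclass seeds this (non-Django) Crawler: rely on base_url alone, or pass explicit start links to Crawler.__init__. A composite sketch with the imports the fragments omit; the class name and the six-hour re-crawl reading of time_delta are assumptions:

from datetime import datetime, timedelta


class PinterestCrawler(Crawler):  # illustrative name only
    def __init__(self):
        self.file_start_time = datetime.now()
        self.time_delta = timedelta(hours=6)  # presumably a re-crawl window
        self.base_url = "http://www.pinterest.com"
        links = [self.base_url, self.base_url + "/all/"]
        Crawler.__init__(self, links)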