import sys
import logging
import asyncio

import cocrawler
import cocrawler.conf as conf
import cocrawler.timer as timer
import cocrawler.webserver as webserver


def main():
    '''
    Main program: parse args, read config, set up event loop, run the crawler.
    '''
    # ARGS (the module-level argparse parser) and limit_resources() are defined
    # elsewhere in the full source; a sketch of ARGS follows this function.
    args = ARGS.parse_args()

    if args.printdefault:
        conf.print_default()
        sys.exit(1)

    # clamp the requested verbosity to the most detailed level available
    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.loglevel, len(levels) - 1)])

    config = conf.config(args.configfile, args.config, confighome=not args.no_confighome)
    limit_resources(config)

    kwargs = {}
    if args.load:
        kwargs['load'] = args.load
    if args.no_test:
        kwargs['no_test'] = True

    loop = asyncio.get_event_loop()
    crawler = cocrawler.Crawler(loop, config, **kwargs)

    if config.get('CarbonStats'):
        timer.start_carbon(loop, config)

    if config['REST']:
        app = webserver.make_app(loop, config)
    else:
        app = None

    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupt. Exiting cleanly.\n')
        crawler.cancel_workers()
    finally:
        crawler.close()
        if app:
            webserver.close(app)
        if config.get('CarbonStats'):
            timer.close()
        # apparently this is needed for full aiohttp cleanup
        loop.stop()
        loop.run_forever()
        loop.close()
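# main() assumes a module-level ARGS parser that is not part of this excerpt.
# A minimal sketch consistent with the attributes main() reads (flag names and
# actions are inferred from the args.<name> accesses, not taken from the
# original source):
import argparse

ARGS = argparse.ArgumentParser(description='CoCrawler web crawler')
ARGS.add_argument('--config', action='append')
ARGS.add_argument('--configfile', action='store')
ARGS.add_argument('--no-confighome', action='store_true')  # argparse exposes this as args.no_confighome
ARGS.add_argument('--no-test', action='store_true')        # exposed as args.no_test
ARGS.add_argument('--printdefault', action='store_true')
ARGS.add_argument('--loglevel', type=int, default=0)       # index into [ERROR, WARN, INFO, DEBUG]
ARGS.add_argument('--load', action='store')                # saved crawler state to resume from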
import os
import asyncio
import tempfile

import cocrawler
import cocrawler.conf as conf
from cocrawler.urls import URL  # cocrawler's URL wrapper; import path assumed from the package layout


def test_cocrawler(capsys):
    config = conf.config(None, None, confighome=False)
    # ok, we have to get around the useragent checks
    config['UserAgent']['MyPrefix'] = 'pytest'
    config['UserAgent']['URL'] = 'http://example.com/pytest-test-cocrawler.py'
    loop = asyncio.get_event_loop()
    crawler = cocrawler.Crawler(loop, config)

    crawler.add_url(0, URL('http://example1.com/'), seed=True)
    crawler.add_url(0, URL('http://example2.com/'), seed=True)
    crawler.add_url(0, URL('http://example3.com/'), seed=True)
    assert crawler.qsize == 3

    # saving drains the queue to disk; loading should replace the live queue
    f = tempfile.NamedTemporaryFile(delete=False)
    name = f.name
    with open(name, 'wb') as f:
        crawler.save(f)
    assert crawler.qsize == 0
    crawler.add_url(0, URL('http://example4.com/'), seed=True)
    assert crawler.qsize == 1

    with open(name, 'rb') as f:
        crawler.load(f)
    assert crawler.qsize == 3
    os.unlink(name)
    assert not os.path.exists(name)

    # clear out the existing capture
    out, err = capsys.readouterr()

    crawler.summarize()
    out, err = capsys.readouterr()
    assert err == ''
    assert len(out) >= 242  # not a very good test, but at least it is something
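# The capsys fixture above exists only under the pytest runner, so this test
# cannot be called as a plain function. A minimal way to drive it
# programmatically, assuming the file is named test_cocrawler.py:
import pytest

if __name__ == '__main__':
    raise SystemExit(pytest.main(['-q', 'test_cocrawler.py']))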
import argparse

import aiohttp

import cocrawler.conf as conf
import cocrawler.dns as dns

ARGS = argparse.ArgumentParser(description='CoCrawler dns benchmark')
ARGS.add_argument('--config', action='append')
ARGS.add_argument('--configfile', action='store')
ARGS.add_argument('--no-confighome', action='store_true')
ARGS.add_argument('--count', type=int, default=1000)

args = ARGS.parse_args()
config = conf.config(args.configfile, args.config, confighome=not args.no_confighome)
max_workers = config['Crawl']['MaxWorkers']

ns = config['Fetcher'].get('Nameservers')
if not isinstance(ns, list):
    ns = [ns]

# resolver = aiohttp.resolver.AsyncResolver(nameservers=ns)  # Can I pass rotate=True into this?
# connector = aiohttp.connector.TCPConnector(resolver=resolver, family=socket.AF_INET)
# session = aiohttp.ClientSession(connector=connector)

exit_value = 0
dns.setup_resolver(ns)
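# The excerpt stops after setup_resolver(); the timing loop itself is not
# shown. A minimal sketch of such a loop, using aiohttp's AsyncResolver
# directly (the hostname list and qps report are illustrative assumptions,
# not the script's actual method):
import time
import socket
import asyncio


async def bench_resolve(hostnames, count):
    resolver = aiohttp.resolver.AsyncResolver(nameservers=ns)  # requires the aiodns package
    t0 = time.time()
    for i in range(count):
        host = hostnames[i % len(hostnames)]
        try:
            await resolver.resolve(host, 80, family=socket.AF_INET)
        except OSError:
            pass  # failed lookups still count toward elapsed time
    elapsed = time.time() - t0
    print('{} lookups in {:.1f}s = {:.1f} qps'.format(count, elapsed, count / elapsed))

# example: asyncio.get_event_loop().run_until_complete(bench_resolve(['example.com'], args.count))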